import pandas as pd
from methods import Train, visualization,Samples
from statsmodels.tsa.arima.model import ARIMA
from sklearn.metrics import mean_absolute_error,mean_absolute_percentage_error,r2_score ,mean_squared_error
from tqdm import tqdm
from statsmodels.tsa.stattools import adfuller
from sklearn.model_selection import GridSearchCV
import numpy as np
import warnings
import matplotlib.pyplot as plt
from pmdarima import auto_arima
warnings.filterwarnings('ignore')
train = Train()
vs = visualization()
sp = Samples()
C:\Users\amine\AppData\Local\Programs\Python\Python310\lib\site-packages\scipy\__init__.py:146: UserWarning: A NumPy version >=1.17.3 and <1.25.0 is required for this version of SciPy (detected version 1.26.4
warnings.warn(f"A NumPy version >={np_minversion} and <{np_maxversion}"
data= pd.read_excel("Superstore_Sales.xls")
data.head()
| Row ID | Order ID | Order Date | Ship Date | Ship Mode | Customer ID | Customer Name | Segment | Country | City | ... | Postal Code | Region | Product ID | Category | Sub-Category | Product Name | Sales | Quantity | Discount | Profit | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | CA-2013-152156 | 2013-11-09 | 2013-11-12 | Second Class | CG-12520 | Claire Gute | Consumer | United States | Henderson | ... | 42420 | South | FUR-BO-10001798 | Furniture | Bookcases | Bush Somerset Collection Bookcase | 261.9600 | 2 | 0.00 | 41.9136 |
| 1 | 2 | CA-2013-152156 | 2013-11-09 | 2013-11-12 | Second Class | CG-12520 | Claire Gute | Consumer | United States | Henderson | ... | 42420 | South | FUR-CH-10000454 | Furniture | Chairs | Hon Deluxe Fabric Upholstered Stacking Chairs,... | 731.9400 | 3 | 0.00 | 219.5820 |
| 2 | 3 | CA-2013-138688 | 2013-06-13 | 2013-06-17 | Second Class | DV-13045 | Darrin Van Huff | Corporate | United States | Los Angeles | ... | 90036 | West | OFF-LA-10000240 | Office Supplies | Labels | Self-Adhesive Address Labels for Typewriters b... | 14.6200 | 2 | 0.00 | 6.8714 |
| 3 | 4 | US-2012-108966 | 2012-10-11 | 2012-10-18 | Standard Class | SO-20335 | Sean O'Donnell | Consumer | United States | Fort Lauderdale | ... | 33311 | South | FUR-TA-10000577 | Furniture | Tables | Bretford CR4500 Series Slim Rectangular Table | 957.5775 | 5 | 0.45 | -383.0310 |
| 4 | 5 | US-2012-108966 | 2012-10-11 | 2012-10-18 | Standard Class | SO-20335 | Sean O'Donnell | Consumer | United States | Fort Lauderdale | ... | 33311 | South | OFF-ST-10000760 | Office Supplies | Storage | Eldon Fold 'N Roll Cart System | 22.3680 | 2 | 0.20 | 2.5164 |
5 rows × 21 columns
data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 9994 entries, 0 to 9993 Data columns (total 21 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Row ID 9994 non-null int64 1 Order ID 9994 non-null object 2 Order Date 9994 non-null datetime64[ns] 3 Ship Date 9994 non-null datetime64[ns] 4 Ship Mode 9994 non-null object 5 Customer ID 9994 non-null object 6 Customer Name 9994 non-null object 7 Segment 9994 non-null object 8 Country 9994 non-null object 9 City 9994 non-null object 10 State 9994 non-null object 11 Postal Code 9994 non-null int64 12 Region 9994 non-null object 13 Product ID 9994 non-null object 14 Category 9994 non-null object 15 Sub-Category 9994 non-null object 16 Product Name 9994 non-null object 17 Sales 9994 non-null float64 18 Quantity 9994 non-null int64 19 Discount 9994 non-null float64 20 Profit 9994 non-null float64 dtypes: datetime64[ns](2), float64(3), int64(3), object(13) memory usage: 1.6+ MB
print("data informations:")
print()
start = data["Order Date"].min().date()
end = data["Order Date"].max().date()
print(f"data range : {start} -> {end}")
print(f"sample shape : {data.shape}")
number_ship_modes = len(data["Ship Mode"].unique())
data informations: data range : 2011-01-04 -> 2014-12-31 sample shape : (9994, 21)
categorical_columns = ["Ship Mode","Segment","Country","City","State","Region","Category","Sub-Category","Product Name"]
print("data informations:")
print()
for column in categorical_columns:
number = len(data[column].unique())
print(f"Number of {column} : {number}")
data informations: Number of Ship Mode : 4 Number of Segment : 3 Number of Country : 1 Number of City : 531 Number of State : 49 Number of Region : 4 Number of Category : 3 Number of Sub-Category : 17 Number of Product Name : 1850
# m = monthly
general_data = sp.generalData(data,"m")
general_data.head()
| Sales | |
|---|---|
| 2011-01-31 | 13946.229 |
| 2011-02-28 | 4810.558 |
| 2011-03-31 | 55691.009 |
| 2011-04-30 | 28295.345 |
| 2011-05-31 | 23648.287 |
#split data into training and testing splits (we'll need them later)
data_train , data_test = train.split_data(general_data,0.75)
print(f"Train size: {data_train.shape[0]}")
print(f"Test size: {data_test.shape[0]}")
Train size: 36 Test size: 12
vs.lineplot(general_data,"Monthly")
#trend
vs.trend_plot(general_data,"Monthly")
#Seasonality
vs.seasonality_plot(general_data,"Monthly")
#Residuals
vs.residual_plot(general_data,"Monthly")
result = train.adf_test(general_data["Sales"])
result
| Statistical test | P-values | Lags used | Number of observations | |
|---|---|---|---|---|
| 0 | -4.302935 | 0.000439 | 0 | 47 |
H0: the data is not stationary
H1: the data is stationary
p-value < 0.05 => reject H0 => so based on the ADF test the data is stationary
(but let's take a look at shorter horizons (using the ACF and PACF plots))
#ACF
vs.acf_plot(general_data,"monthly")
#PACF
vs.pacf_plot(general_data,"monthly")
We are going to automate the ARIMA parameter search (using only the lags with a significant correlation), similar to a grid search.
models_arima = train.train_arima(general_data,0.75)
0%| | 0/1 [00:00<?, ?it/s]
ADF Statistic: -4.302935180048875 p-value: 0.0004390420862895829 acf [0, 1, 12] pacf [0, 1, 7, 11, 12, 13, 14, 15, 16] (ar,d,ma) : (0, 0, 0) (ar,d,ma) : (0, 0, 1) (ar,d,ma) : (0, 0, 12) (ar,d,ma) : (1, 0, 0) (ar,d,ma) : (1, 0, 1) (ar,d,ma) : (1, 0, 12) (ar,d,ma) : (7, 0, 0) (ar,d,ma) : (7, 0, 1) (ar,d,ma) : (7, 0, 12) (ar,d,ma) : (11, 0, 0) (ar,d,ma) : (11, 0, 1) (ar,d,ma) : (11, 0, 12) (ar,d,ma) : (12, 0, 0) (ar,d,ma) : (12, 0, 1) (ar,d,ma) : (12, 0, 12) (ar,d,ma) : (13, 0, 0) (ar,d,ma) : (13, 0, 1) (ar,d,ma) : (13, 0, 12) (ar,d,ma) : (14, 0, 0) (ar,d,ma) : (14, 0, 1) (ar,d,ma) : (14, 0, 12) (ar,d,ma) : (15, 0, 0) (ar,d,ma) : (15, 0, 1) (ar,d,ma) : (15, 0, 12) (ar,d,ma) : (16, 0, 0) (ar,d,ma) : (16, 0, 1) (ar,d,ma) : (16, 0, 12)
100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:53<00:00, 53.85s/it]
print("Our results:")
models_arima.head(5)
Our results:
| Model | Segment | ACF | d | PACF | Order | MAE | MAPE | RMSE | R2_Score | MAE_train | MAPE_train | R2_Score_train | RMSE_train | Y_test | Y_prediction | Y_prediction_train | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | <statsmodels.tsa.arima.model.ARIMAResultsWrapp... | Sales | 0 | 0 | 0 | (0, 0, 0) | 22147.117752 | 0.340696 | 30817.378021 | -0.495478 | 19627.813463 | 0.755736 | 0.000000 | 23033.252737 | 2014-01-31 44703.1420 2014-02-28 20283... | 2014-01-31 43423.717697 2014-02-28 43423... | 2011-01-31 43423.717697 2011-02-28 43423... |
| 1 | <statsmodels.tsa.arima.model.ARIMAResultsWrapp... | Sales | 1 | 0 | 0 | (0, 0, 1) | 22875.367438 | 0.356987 | 30950.580609 | -0.508433 | 18964.650570 | 0.697038 | 0.053323 | 22410.735442 | 2014-01-31 44703.1420 2014-02-28 20283... | 2014-01-31 54721.562540 2014-02-28 43423... | 2011-01-31 43423.717698 2011-02-28 36355... |
| 2 | <statsmodels.tsa.arima.model.ARIMAResultsWrapp... | Sales | 12 | 0 | 0 | (0, 0, 12) | 26795.023306 | 0.397330 | 35990.127484 | -1.039648 | 15174.124037 | 0.623168 | 0.403197 | 17793.881100 | 2014-01-31 44703.1420 2014-02-28 20283... | 2014-01-31 53205.835865 2014-02-28 37816... | 2011-01-31 43423.717747 2011-02-28 34359... |
| 3 | <statsmodels.tsa.arima.model.ARIMAResultsWrapp... | Sales | 0 | 0 | 1 | (1, 0, 0) | 23332.790980 | 0.375692 | 31249.289049 | -0.537690 | 18989.376768 | 0.700456 | 0.054599 | 22395.633276 | 2014-01-31 44703.1420 2014-02-28 20283... | 2014-01-31 57373.833719 2014-02-28 47040... | 2011-01-31 43423.717698 2011-02-28 35782... |
| 4 | <statsmodels.tsa.arima.model.ARIMAResultsWrapp... | Sales | 1 | 0 | 1 | (1, 0, 1) | 23634.997319 | 0.397574 | 31680.445957 | -0.580415 | 18783.200204 | 0.699737 | 0.057506 | 22361.174600 | 2014-01-31 44703.1420 2014-02-28 20283... | 2014-01-31 60253.713383 2014-02-28 52669... | 2011-01-31 43423.717720 2011-02-28 35324... |
max_r2score_train = max(models_arima["R2_Score_train"])
max_r2score_test = max(models_arima["R2_Score"])
min_mape_train=min(models_arima["MAPE_train"])
min_mape_test = min(models_arima["MAPE"])
print(f"the hightest R2_score (on train data , to test how much the model fits the training data ): {max_r2score_train}")
print(f"the hightest R2_score on test data : {max_r2score_test}")
print()
print(f"the lowest mean absolute percentage error on trainset: {min_mape_train}")
print(f"the lowest mean absolute percentage error on test set: {min_mape_test}")
the hightest R2_score (on train data , to test how much the model fits the training data ): 0.621310742492378 the hightest R2_score on test data : 0.38209818883040014 the lowest mean absolute percentage error on trainset: 0.4733625267185262 the lowest mean absolute percentage error on test set: 0.256347867351916
print("the model that best fits the training set :")
models_arima[models_arima["R2_Score_train"] == max_r2score_train]
the model that best fits the training set :
| Model | Segment | ACF | d | PACF | Order | MAE | MAPE | RMSE | R2_Score | MAE_train | MAPE_train | R2_Score_train | RMSE_train | Y_test | Y_prediction | Y_prediction_train | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 25 | <statsmodels.tsa.arima.model.ARIMAResultsWrapp... | Sales | 1 | 0 | 16 | (16, 0, 1) | 16120.950262 | 0.277508 | 20831.330919 | 0.316682 | 10824.025256 | 0.473363 | 0.621311 | 14174.141556 | 2014-01-31 44703.1420 2014-02-28 20283... | 2014-01-31 42751.685034 2014-02-28 35307... | 2011-01-31 43423.717892 2011-02-28 39287... |
print("The model with best performance on test set :")
models_arima[models_arima["R2_Score"] == max_r2score_test]
The model with best performance on test set :
| Model | Segment | ACF | d | PACF | Order | MAE | MAPE | RMSE | R2_Score | MAE_train | MAPE_train | R2_Score_train | RMSE_train | Y_test | Y_prediction | Y_prediction_train | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 12 | <statsmodels.tsa.arima.model.ARIMAResultsWrapp... | Sales | 0 | 0 | 12 | (12, 0, 0) | 16183.870802 | 0.256348 | 19809.133067 | 0.382098 | 12281.260775 | 0.509558 | 0.571178 | 15083.203632 | 2014-01-31 44703.1420 2014-02-28 20283... | 2014-01-31 32588.926931 2014-02-28 24658... | 2011-01-31 43423.717753 2011-02-28 39133... |
But based on the ACF and PACF plots, we observed that the first lags are outside the significance band.
Let's explore that model's performance.
models_arima.loc[(models_arima["ACF"] == 1) & (models_arima["PACF"] ==1) ]
| Model | Segment | ACF | d | PACF | Order | MAE | MAPE | RMSE | R2_Score | MAE_train | MAPE_train | R2_Score_train | RMSE_train | Y_test | Y_prediction | Y_prediction_train | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 4 | <statsmodels.tsa.arima.model.ARIMAResultsWrapp... | Sales | 1 | 0 | 1 | (1, 0, 1) | 23634.997319 | 0.397574 | 31680.445957 | -0.580415 | 18783.200204 | 0.699737 | 0.057506 | 22361.1746 | 2014-01-31 44703.1420 2014-02-28 20283... | 2014-01-31 60253.713383 2014-02-28 52669... | 2011-01-31 43423.717720 2011-02-28 35324... |
plt.title("Forecasts against Training")
plt.plot(data_train["Sales"],label="actual trainning data")
plt.plot(models_arima["Y_prediction_train"][25],label="predicted training data ARIMA(16,0,1)")
plt.plot(models_arima["Y_prediction_train"][12],label="predicted training data ARIMA(12,0,0)")
plt.plot(models_arima["Y_prediction_train"][4],label="predicted training data ARIMA(1,0,1)")
plt.legend()
<matplotlib.legend.Legend at 0x27847a83580>
plt.title("Forecasts against Test sets")
plt.plot(data_test["Sales"],label="actual test data")
plt.plot(models_arima["Y_prediction"][25],label="predicted test data ARIMA(16,0,1)")
plt.plot(models_arima["Y_prediction"][12],label="predicted test data ARIMA(12,0,0)")
plt.plot(models_arima["Y_prediction"][4],label="predicted test data ARIMA(1,0,1)")
plt.legend()
<matplotlib.legend.Legend at 0x27847c84850>
evaluation = {
"Models":["ARIMA(1,0,1)","ARIMA(12,0,0)","ARIMA(16,0,1)"],
"R2_score_train":[models_arima["R2_Score_train"][4],models_arima["R2_Score_train"][12],models_arima["R2_Score_train"][25]],
"R2_score_test":[models_arima["R2_Score"][4],models_arima["R2_Score"][12],models_arima["R2_Score"][25]],
"MAE_train":[models_arima["MAE_train"][4],models_arima["MAE_train"][12],models_arima["MAE_train"][25]],
"MAE_test":[models_arima["MAE"][4],models_arima["MAE"][12],models_arima["MAE"][25]],
"RMSE_train":[models_arima["RMSE_train"][4],models_arima["RMSE_train"][12],models_arima["RMSE_train"][25]],
"RMSE_test":[models_arima["RMSE"][4],models_arima["RMSE"][12],models_arima["RMSE"][25]],
"MAPE_train":[models_arima["MAPE_train"][4],models_arima["MAPE_train"][12],models_arima["MAPE_train"][25]],
"MAPE_test":[models_arima["MAPE"][4],models_arima["MAPE"][12],models_arima["MAPE"][25]],
}
evaluation = pd.DataFrame(evaluation).set_index("Models")
evaluation
| R2_score_train | R2_score_test | MAE_train | MAE_test | RMSE_train | RMSE_test | MAPE_train | MAPE_test | |
|---|---|---|---|---|---|---|---|---|
| Models | ||||||||
| ARIMA(1,0,1) | 0.057506 | -0.580415 | 18783.200204 | 23634.997319 | 22361.174600 | 31680.445957 | 0.699737 | 0.397574 |
| ARIMA(12,0,0) | 0.571178 | 0.382098 | 12281.260775 | 16183.870802 | 15083.203632 | 19809.133067 | 0.509558 | 0.256348 |
| ARIMA(16,0,1) | 0.621311 | 0.316682 | 10824.025256 | 16120.950262 | 14174.141556 | 20831.330919 | 0.473363 | 0.277508 |
In Python's statsmodels we can't choose an AR lag or MA lag greater than the seasonal lag,
so we can only use ar = [1, 7, 11] and ma = [1].
Identify the SARIMA(p,d,q)(P,D,Q,S) parameters:
p = 1, 7 or 11 (from the PACF plot)
d = 0 (because we did not difference our data)
q = 1 (from the ACF plot)
S = 12 (based on the seasonal plot)
D = 1 (based on the seasonal plot: there is a stable seasonal pattern)
P >= 1: because the PACF is positive at the seasonal lag S (= 12)
Q >= 1: because the ACF is positive at the seasonal lag S
NB: P + Q <= 2 (rule of thumb)
def initialize_sarima_dict():
    """Create an empty results container for the SARIMA grid search.

    Returns a dict mapping each result-column name (fitted model object,
    identified orders, seasonal orders, train/test error metrics, and the
    stored predictions) to a fresh empty list, ready to be filled one row
    per trained model and converted into a DataFrame.
    """
    columns = (
        'Model', 'Segment', 'ACF', 'd', 'PACF', 'Order',
        'P', 'D', 'Q', 'S',
        'MAE', 'MAPE', 'RMSE', 'R2_Score',
        'MAE_train', 'MAPE_train', 'R2_Score_train', 'RMSE_train',
        'Y_test', 'Y_prediction', 'Y_prediction_train',
    )
    return {column: [] for column in columns}
def train_sarima(data_train, data_test, column_name, evaluation, p, d, q, upperP, D, upperQ, S):
    """Fit one SARIMA(p,d,q)(P,D,Q,S) model and append its scores to `evaluation`.

    Fits on `data_train`, produces in-sample predictions (skipping the first
    S warm-up observations) and a `len(data_test)`-step forecast, computes
    MAE / MAPE / RMSE / R2 on both splits, and records everything as one row
    in the `evaluation` dict of lists.

    Parameters
    ----------
    data_train, data_test : pandas Series with the target values.
    column_name : label stored in the 'Segment' column.
    evaluation : dict of lists, as produced by initialize_sarima_dict().
    p, d, q : non-seasonal ARIMA orders.
    upperP, D, upperQ, S : seasonal orders (P, D, Q) and seasonal period S.

    Returns
    -------
    pandas.DataFrame built from `evaluation` (one row per recorded model).
    """
    sarima_model = ARIMA(data_train,
                         order=(p, d, q),
                         seasonal_order=(upperP, D, upperQ, S)
                         ).fit()
    # In-sample (one-step-ahead) predictions and out-of-sample forecasts.
    y_pred_train = sarima_model.predict()
    y_pred_test = sarima_model.forecast(len(data_test))
    order = (p, d, q)
    # Drop the first S months from the training comparison: with seasonal
    # differencing the model has no meaningful predictions for the first season.
    data_train = data_train[S:]
    y_pred_train = y_pred_train[S:]
    # Evaluate training fit.
    mae_train = mean_absolute_error(data_train, y_pred_train)
    mape_train = mean_absolute_percentage_error(data_train, y_pred_train)
    r2score_train = r2_score(data_train, y_pred_train)
    # np.sqrt(MSE) instead of mean_squared_error(..., squared=False): the
    # `squared` keyword is deprecated in scikit-learn 1.4 and removed in 1.6.
    rmse_train = np.sqrt(mean_squared_error(data_train, y_pred_train))
    # Evaluate the forecast against the test set.
    mae_forecast = mean_absolute_error(data_test, y_pred_test)
    mape_forecast = mean_absolute_percentage_error(data_test, y_pred_test)
    r2score_forecast = r2_score(data_test, y_pred_test)
    rmse_forecast = np.sqrt(mean_squared_error(data_test, y_pred_test))
    # Add one row (save the model and its scores).
    evaluation['Model'].append(sarima_model)
    evaluation['Segment'].append(column_name)
    evaluation['ACF'].append(q)   # MA order q is read off the ACF plot
    evaluation['d'].append(d)
    evaluation['PACF'].append(p)  # AR order p is read off the PACF plot
    evaluation['Order'].append(order)
    evaluation['P'].append(upperP)
    evaluation['Q'].append(upperQ)
    evaluation['D'].append(D)
    evaluation['S'].append(S)
    evaluation['MAE'].append(mae_forecast)
    evaluation['MAPE'].append(mape_forecast)
    evaluation['R2_Score'].append(r2score_forecast)
    evaluation['RMSE'].append(rmse_forecast)
    evaluation['MAE_train'].append(mae_train)
    evaluation['MAPE_train'].append(mape_train)
    evaluation['R2_Score_train'].append(r2score_train)
    evaluation['RMSE_train'].append(rmse_train)
    evaluation['Y_test'].append(data_test)
    evaluation['Y_prediction'].append(y_pred_test)
    evaluation['Y_prediction_train'].append(y_pred_train)
    return pd.DataFrame(evaluation)
# Initialize the grid of SARIMA hyper-parameters.
# Non-seasonal candidates come from the ACF/PACF analysis above;
# seasonal candidates follow the P + Q <= 2 rule of thumb.
p_list= [0,1,7,11]
d= 0
q= 1
S= 12
D=1
upperP_list=[0,1,2]
Q_list=[0,1,2]
results_sarima = initialize_sarima_dict()
results_sarima = pd.DataFrame(results_sarima)
# Train one SARIMA model per admissible (p, P, Q) combination.
for p in p_list:
    for upperP in upperP_list:
        for upperQ in Q_list:
            # Constraint: P + Q <= 2 (rule of thumb).
            if(upperP + upperQ <= 2):
                empty_dict = initialize_sarima_dict()
                # Fit the SARIMA model and collect its one-row result.
                row = train_sarima(data_train["Sales"],data_test["Sales"],"Sales",empty_dict,p,d,q,upperP,D,upperQ,S)
                results_sarima = pd.concat([results_sarima,row],axis=0)
# Re-index the concatenated results with a clean 0..n-1 integer index.
new_index = pd.Series([i for i in range(results_sarima.shape[0])])
new_index
results_sarima = results_sarima.set_index(new_index)
results_sarima.head(5)
| Model | Segment | ACF | d | PACF | Order | P | D | Q | S | ... | MAPE | RMSE | R2_Score | MAE_train | MAPE_train | R2_Score_train | RMSE_train | Y_test | Y_prediction | Y_prediction_train | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | <statsmodels.tsa.arima.model.ARIMAResultsWrapp... | Sales | 1.0 | 0.0 | 0.0 | (0, 0, 1) | 0.0 | 1.0 | 0.0 | 12.0 | ... | 0.218698 | 16776.366033 | 0.556816 | 9604.033028 | 0.238234 | 0.715988 | 11900.595661 | 2014-01-31 44703.1420 2014-02-28 20283... | 2014-01-31 21315.86516 2014-02-28 22867.... | 2012-01-31 21489.696855 2012-02-29 4390... |
| 1 | <statsmodels.tsa.arima.model.ARIMAResultsWrapp... | Sales | 1.0 | 0.0 | 0.0 | (0, 0, 1) | 0.0 | 1.0 | 1.0 | 12.0 | ... | 0.216438 | 16945.120914 | 0.547855 | 9430.524622 | 0.235910 | 0.730877 | 11584.460408 | 2014-01-31 44703.1420 2014-02-28 20283... | 2014-01-31 22041.096465 2014-02-28 21996... | 2012-01-31 21846.542567 2012-02-29 3862... |
| 2 | <statsmodels.tsa.arima.model.ARIMAResultsWrapp... | Sales | 1.0 | 0.0 | 0.0 | (0, 0, 1) | 0.0 | 1.0 | 2.0 | 12.0 | ... | 0.235631 | 16992.819975 | 0.545306 | 7937.751452 | 0.212043 | 0.820133 | 9470.564637 | 2014-01-31 44703.1420 2014-02-28 20283... | 2014-01-31 19599.207038 2014-02-28 23932... | 2012-01-31 11519.461917 2012-02-29 4353... |
| 3 | <statsmodels.tsa.arima.model.ARIMAResultsWrapp... | Sales | 1.0 | 0.0 | 0.0 | (0, 0, 1) | 1.0 | 1.0 | 0.0 | 12.0 | ... | 0.216299 | 17039.431012 | 0.542808 | 9327.947045 | 0.233905 | 0.736637 | 11459.827552 | 2014-01-31 44703.1420 2014-02-28 20283... | 2014-01-31 22227.141629 2014-02-28 21729... | 2012-01-31 21775.907534 2012-02-29 3698... |
| 4 | <statsmodels.tsa.arima.model.ARIMAResultsWrapp... | Sales | 1.0 | 0.0 | 0.0 | (0, 0, 1) | 1.0 | 1.0 | 1.0 | 12.0 | ... | 0.267080 | 20714.433010 | 0.324330 | 8234.904328 | 0.209317 | 0.791649 | 10192.921132 | 2014-01-31 44703.1420 2014-02-28 20283... | 2014-01-31 18684.944555 2014-02-28 21990... | 2012-01-31 16367.950943 2012-02-29 4281... |
5 rows × 21 columns
max_r2score_sarima_train = max(results_sarima["R2_Score_train"])
max_r2score_sarima_test = max(results_sarima["R2_Score"])
min_mape_sarima_train=min(results_sarima["MAPE_train"])
min_mape_sarima_test = min(results_sarima["MAPE"])
print(f"the hightest R2_score (on train data , to test how much the model fits the training data ): {max_r2score_sarima_train}")
print(f"the hightest R2_score on test data : {max_r2score_sarima_test}")
print()
print(f"the lowest mean absolute percentage error on trainset: {min_mape_sarima_train}")
print(f"the lowest mean absolute percentage error on test set: {min_mape_sarima_test}")
the hightest R2_score (on train data , to test how much the model fits the training data ): 0.8920357670221702 the hightest R2_score on test data : 0.7433493792977566 the lowest mean absolute percentage error on trainset: 0.16160390227808077 the lowest mean absolute percentage error on test set: 0.20528707298264692
print("the model that best fits the training set :")
results_sarima[results_sarima["R2_Score_train"] == max_r2score_sarima_train]
the model that best fits the training set :
| Model | Segment | ACF | d | PACF | Order | P | D | Q | S | ... | MAPE | RMSE | R2_Score | MAE_train | MAPE_train | R2_Score_train | RMSE_train | Y_test | Y_prediction | Y_prediction_train | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 21 | <statsmodels.tsa.arima.model.ARIMAResultsWrapp... | Sales | 1.0 | 0.0 | 11.0 | (11, 0, 1) | 1.0 | 1.0 | 0.0 | 12.0 | ... | 0.218682 | 13428.25399 | 0.716059 | 6100.436514 | 0.161604 | 0.892036 | 7337.374139 | 2014-01-31 44703.1420 2014-02-28 20283... | 2014-01-31 26481.701069 2014-02-28 312... | 2012-01-31 12299.623354 2012-02-29 13020... |
1 rows × 21 columns
print("The model with best performance on test set :")
results_sarima[results_sarima["R2_Score"] == max_r2score_sarima_test]
The model with best performance on test set :
| Model | Segment | ACF | d | PACF | Order | P | D | Q | S | ... | MAPE | RMSE | R2_Score | MAE_train | MAPE_train | R2_Score_train | RMSE_train | Y_test | Y_prediction | Y_prediction_train | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 9 | <statsmodels.tsa.arima.model.ARIMAResultsWrapp... | Sales | 1.0 | 0.0 | 1.0 | (1, 0, 1) | 1.0 | 1.0 | 0.0 | 12.0 | ... | 0.205828 | 12766.648308 | 0.743349 | 6837.146239 | 0.167065 | 0.855312 | 8494.09819 | 2014-01-31 44703.1420 2014-02-28 20283... | 2014-01-31 28487.875662 2014-02-28 301... | 2012-01-31 20410.055811 2012-02-29 13521... |
1 rows × 21 columns
plt.title("Forecasts against Training")
plt.plot(data_train["Sales"],label="actual trainning data")
plt.plot(results_sarima["Y_prediction_train"][21],label="predicted training data SARIMA(11,0,1)(1,1,0,12)")
plt.plot(results_sarima["Y_prediction_train"][9],label="predicted training data SARIMA(1,0,1)(1,1,0,12)")
plt.legend()
<matplotlib.legend.Legend at 0x2784817ae60>
plt.title("Forecasts against Testing")
plt.plot(data_test["Sales"],label="actual Testing data")
plt.plot(results_sarima["Y_prediction"][21],label="predicted Testing data SARIMA(11,0,1)(1,1,0,12)")
plt.plot(results_sarima["Y_prediction"][9],label="predicted Testing data SARIMA(1,0,1)(1,1,0,12)")
plt.legend()
<matplotlib.legend.Legend at 0x278482002b0>
evaluation_SARIMA = {
"Models":["SARIMA(11,0,1)(1,1,0,12)","SARIMA(1,0,1)(1,1,0,12)"],
"R2_score_train":[results_sarima["R2_Score_train"][21],results_sarima["R2_Score_train"][9]],
"R2_score_test":[results_sarima["R2_Score"][21],results_sarima["R2_Score"][9]],
"MAE_train":[results_sarima["MAE_train"][21],results_sarima["MAE_train"][9]],
"MAE_test":[results_sarima["MAE"][21],results_sarima["MAE"][9]],
"RMSE_train":[results_sarima["RMSE_train"][21],results_sarima["RMSE_train"][9]],
"RMSE_test":[results_sarima["RMSE"][21],results_sarima["RMSE"][9]],
"MAPE_train":[results_sarima["MAPE_train"][21],results_sarima["MAPE_train"][9]],
"MAPE_test":[results_sarima["MAPE"][21],results_sarima["MAPE"][9]],
}
evaluation_SARIMA = pd.DataFrame(evaluation_SARIMA).set_index("Models")
evaluation_SARIMA
| R2_score_train | R2_score_test | MAE_train | MAE_test | RMSE_train | RMSE_test | MAPE_train | MAPE_test | |
|---|---|---|---|---|---|---|---|---|
| Models | ||||||||
| SARIMA(11,0,1)(1,1,0,12) | 0.892036 | 0.716059 | 6100.436514 | 11702.058017 | 7337.374139 | 13428.253990 | 0.161604 | 0.218682 |
| SARIMA(1,0,1)(1,1,0,12) | 0.855312 | 0.743349 | 6837.146239 | 11230.945070 | 8494.098190 | 12766.648308 | 0.167065 | 0.205828 |
import numpy as np
np.sqrt(results_sarima["RMSE_train"][21])
85.65847382940096
auto_arima = auto_arima(data_train["Sales"],m=12,
max_order=None, max_p=12,max_d=4,max_q=12,
max_P=4,max_D=2,max_Q=4,
maxiter=500, alpha=0.05, n_jobs= -1,trend=[1,1,1,1])
auto_arima.summary()
| Dep. Variable: | y | No. Observations: | 36 |
|---|---|---|---|
| Model: | SARIMAX(1, 0, 0, 12) | Log Likelihood | -400.754 |
| Date: | Sun, 23 Jun 2024 | AIC | 813.507 |
| Time: | 12:35:23 | BIC | 823.008 |
| Sample: | 01-31-2011 | HQIC | 816.823 |
| - 12-31-2013 | |||
| Covariance Type: | opg |
| coef | std err | z | P>|z| | [0.025 | 0.975] | |
|---|---|---|---|---|---|---|
| intercept | 8098.0140 | 1.41e+04 | 0.575 | 0.565 | -1.95e+04 | 3.57e+04 |
| drift | 7511.8488 | 4164.171 | 1.804 | 0.071 | -649.777 | 1.57e+04 |
| trend.2 | -572.5466 | 339.955 | -1.684 | 0.092 | -1238.845 | 93.752 |
| trend.3 | 11.5402 | 7.193 | 1.604 | 0.109 | -2.558 | 25.639 |
| ar.S.L12 | 0.4444 | 0.256 | 1.733 | 0.083 | -0.058 | 0.947 |
| sigma2 | 3.609e+08 | 0.094 | 3.86e+09 | 0.000 | 3.61e+08 | 3.61e+08 |
| Ljung-Box (L1) (Q): | 0.41 | Jarque-Bera (JB): | 2.92 |
|---|---|---|---|
| Prob(Q): | 0.52 | Prob(JB): | 0.23 |
| Heteroskedasticity (H): | 0.28 | Skew: | 0.58 |
| Prob(H) (two-sided): | 0.04 | Kurtosis: | 2.23 |
=> so based on auto_arima the best model is ARIMA(1, 0, 0)(1, 0, 0)[12]
best_auto_arima_model = ARIMA(data_train["Sales"], order=(1, 0, 0), seasonal_order=(1, 0, 0, 12)).fit()
y_pred_train_best_AAM= best_auto_arima_model.predict()
y_pred_test_best_AAM = best_auto_arima_model.forecast(len(data_test["Sales"]))
plt.title("Forecasts against Training")
plt.plot(data_train["Sales"],label="actual trainning data")
plt.plot(y_pred_train_best_AAM,label="predicted training data ARIMA(1, 0, 0)(1, 0, 0)[12]")
plt.legend()
<matplotlib.legend.Legend at 0x278481b5990>
plt.title("Forecasts against Testing")
plt.plot(data_test["Sales"],label="actual Testing data")
plt.plot(y_pred_test_best_AAM,label="predicted Testing data ARIMA(1, 0, 0)(1, 0, 0)[12]")
plt.legend()
<matplotlib.legend.Legend at 0x2784a8ea530>
#train
best_auto_arima_model_r2score_train = r2_score(data_train["Sales"],y_pred_train_best_AAM)
best_auto_arima_model_mae_train = mean_absolute_error(data_train["Sales"],y_pred_train_best_AAM)
best_auto_arima_model_mape_train = mean_absolute_percentage_error(data_train["Sales"],y_pred_train_best_AAM)
best_auto_arima_model_rmse_train = mean_squared_error(data_train["Sales"],y_pred_train_best_AAM)
#test
best_auto_arima_model_r2score_test = r2_score(data_test["Sales"],y_pred_test_best_AAM)
best_auto_arima_model_mae_test = mean_absolute_error(data_test["Sales"],y_pred_test_best_AAM)
best_auto_arima_model_mape_test = mean_absolute_percentage_error(data_test["Sales"],y_pred_test_best_AAM)
best_auto_arima_model_rmse_test = mean_squared_error(data_test["Sales"],y_pred_test_best_AAM)
evaluation_best_auto_arima_model = {
"Models":["ARIMA(1, 0, 0)(1, 0, 0)[12]"],
"R2_score_train":[best_auto_arima_model_r2score_train],
"R2_score_test":[best_auto_arima_model_r2score_test],
"MAE_train":[best_auto_arima_model_mae_train],
"MAE_test":[best_auto_arima_model_mae_test],
"RMSE_train":[best_auto_arima_model_rmse_train],
"RMSE_test":[best_auto_arima_model_rmse_test],
"MAPE_train":[best_auto_arima_model_mape_train],
"MAPE_test":[best_auto_arima_model_mape_test],
}
evaluation_best_auto_arima_model = pd.DataFrame(evaluation_best_auto_arima_model).set_index("Models")
evaluation_best_auto_arima_model
| R2_score_train | R2_score_test | MAE_train | MAE_test | RMSE_train | RMSE_test | MAPE_train | MAPE_test | |
|---|---|---|---|---|---|---|---|---|
| Models | ||||||||
| ARIMA(1, 0, 0)(1, 0, 0)[12] | 0.447579 | 0.406889 | 13514.789728 | 15270.009035 | 2.930763e+08 | 3.766580e+08 | 0.517657 | 0.243752 |
=> We conclude that our function (train_sarima) has better performance than auto_arima. The best model so far is SARIMA(11,0,1)(1,1,0,12).
from darts import TimeSeries
series = general_data.copy()
series["Months"] = series.index
series = TimeSeries.from_dataframe(series,"Months",'Sales')
train, val = series.split_before(data_test.index[0])
from darts.utils.statistics import check_seasonality
check_seasonality(series,max_lag=120)
(True, 3)
from darts.models import ExponentialSmoothing
The StatsForecast module could not be imported. To enable support for the StatsForecastAutoARIMA, StatsForecastAutoETS and Croston models, please consider installing it.
model_ses = ExponentialSmoothing().fit(train)
predictions_ses_test = model_ses.predict(len(val),num_samples=1000)
#plt.plot(val,label="real test")
val.plot(label="actual test")
predictions_ses_test.plot(label="prediction")
plt.plot(results_sarima["Y_prediction"][9],label="predicted Testing data SARIMA(1,0,1)(1,1,0,12)")
plt.legend()
<matplotlib.legend.Legend at 0x278669e3070>
from darts.metrics import mae,mape,r2_score,rmse
#test
ses_model_r2score_test = r2_score(predictions_ses_test,series)
ses_model_mae_test = mae(predictions_ses_test,series)
ses_model_mape_test = mape(predictions_ses_test,series)
ses_model_rmse_test = rmse(predictions_ses_test,series)
evaluation_ses_model = {
"Models":["SES"],
"R2_score_test":[ses_model_r2score_test],
"MAE_test":[ses_model_mae_test],
"RMSE_test":[ses_model_rmse_test],
"MAPE_test":[ses_model_mape_test],
}
evaluation_ses_model = pd.DataFrame(evaluation_ses_model).set_index("Models")
evaluation_ses_model
| R2_score_test | MAE_test | RMSE_test | MAPE_test | |
|---|---|---|---|---|
| Models | ||||
| SES | 0.770233 | 9486.114864 | 11021.648422 | 18.72872 |
from darts.models import Theta
model_theta = Theta()
model_theta.fit(train)
prediction_theta = model_theta.predict(len(val))
val.plot(label="actual test")
predictions_ses_test.plot(label="prediction SES")
prediction_theta.plot(label="prediction Theta")
plt.plot(results_sarima["Y_prediction"][9],label="predicted Testing data SARIMA(1,0,1)(1,1,0,12)")
plt.legend()
<matplotlib.legend.Legend at 0x27866af1840>
#test
theta_model_r2score_test = r2_score(prediction_theta,series)
theta_model_mae_test = mae(prediction_theta,series)
theta_model_mape_test = mape(prediction_theta,series)
theta_model_rmse_test = rmse(prediction_theta,series)
evaluation_theta_model = {
"Models":["Theta"],
"R2_score_test":[theta_model_r2score_test],
"MAE_test":[theta_model_mae_test],
"RMSE_test":[theta_model_rmse_test],
"MAPE_test":[theta_model_mape_test],
}
evaluation_theta_model = pd.DataFrame(evaluation_theta_model).set_index("Models")
evaluation_theta_model
| R2_score_test | MAE_test | RMSE_test | MAPE_test | |
|---|---|---|---|---|
| Models | ||||
| Theta | -1.474407 | 24530.23595 | 29238.453618 | 32.817128 |
from darts.models import AutoARIMA
#seasonality = 3
darts_aarima = AutoARIMA(m=3, #seasonality
seasonal = True,
test= "adf",
d=None,
start_p = 0 , start_q = 0,
max_p = 12 , max_q = 12,
D=None,
start_P = 0 , start_Q = 0,
trace = True,
error_action = "ignore",
suppress_warnings = True,
stepwise = True
)
darts_aarima.fit(train)
predictions_darts_aarima = darts_aarima.predict(len(val))
Performing stepwise search to minimize aic ARIMA(0,2,0)(0,0,0)[3] : AIC=829.693, Time=0.02 sec ARIMA(1,2,0)(1,0,0)[3] : AIC=811.519, Time=0.06 sec ARIMA(0,2,1)(0,0,1)[3] : AIC=800.121, Time=0.15 sec ARIMA(0,2,1)(0,0,0)[3] : AIC=798.733, Time=0.06 sec ARIMA(0,2,1)(1,0,0)[3] : AIC=799.656, Time=0.12 sec ARIMA(0,2,1)(1,0,1)[3] : AIC=inf, Time=0.67 sec ARIMA(1,2,1)(0,0,0)[3] : AIC=794.822, Time=0.07 sec ARIMA(1,2,1)(1,0,0)[3] : AIC=795.858, Time=0.16 sec ARIMA(1,2,1)(0,0,1)[3] : AIC=796.205, Time=0.15 sec ARIMA(1,2,1)(1,0,1)[3] : AIC=793.861, Time=0.33 sec ARIMA(1,2,1)(2,0,1)[3] : AIC=796.142, Time=0.39 sec ARIMA(1,2,1)(1,0,2)[3] : AIC=795.691, Time=0.82 sec ARIMA(1,2,1)(0,0,2)[3] : AIC=797.026, Time=0.28 sec ARIMA(1,2,1)(2,0,0)[3] : AIC=795.768, Time=0.22 sec ARIMA(1,2,1)(2,0,2)[3] : AIC=795.885, Time=0.73 sec ARIMA(1,2,0)(1,0,1)[3] : AIC=808.745, Time=0.22 sec ARIMA(2,2,1)(1,0,1)[3] : AIC=792.797, Time=0.55 sec ARIMA(2,2,1)(0,0,1)[3] : AIC=791.684, Time=0.48 sec ARIMA(2,2,1)(0,0,0)[3] : AIC=793.161, Time=0.11 sec ARIMA(2,2,1)(0,0,2)[3] : AIC=792.511, Time=0.54 sec ARIMA(2,2,1)(1,0,0)[3] : AIC=793.958, Time=0.29 sec ARIMA(2,2,1)(1,0,2)[3] : AIC=793.294, Time=0.66 sec ARIMA(2,2,0)(0,0,1)[3] : AIC=794.161, Time=0.25 sec ARIMA(2,2,2)(0,0,1)[3] : AIC=791.772, Time=0.71 sec ARIMA(1,2,0)(0,0,1)[3] : AIC=813.098, Time=0.08 sec ARIMA(1,2,2)(0,0,1)[3] : AIC=inf, Time=0.32 sec ARIMA(2,2,1)(0,0,1)[3] intercept : AIC=795.267, Time=0.25 sec Best model: ARIMA(2,2,1)(0,0,1)[3] Total fit time: 8.789 seconds
#seasonality 12
# Same stepwise auto-ARIMA search, but with a yearly (m=12) seasonal
# period, which matches the monthly frequency of this series.
darts_aarima_12 = AutoARIMA(
    m=12,                    # 12-month seasonality
    seasonal=True,
    test="adf",              # ADF test picks the differencing order d
    d=None,
    D=None,
    start_p=0,
    start_q=0,
    max_p=12,
    max_q=12,
    start_P=0,
    start_Q=0,
    trace=True,              # print each candidate's AIC
    error_action="ignore",
    suppress_warnings=True,
    stepwise=True,
)
darts_aarima_12.fit(train)
predictions_darts_aarima_12 = darts_aarima_12.predict(len(val))
Performing stepwise search to minimize aic ARIMA(0,2,0)(0,0,0)[12] : AIC=829.693, Time=0.02 sec ARIMA(1,2,0)(1,0,0)[12] : AIC=802.832, Time=0.10 sec ARIMA(0,2,1)(0,0,1)[12] : AIC=inf, Time=0.42 sec ARIMA(1,2,0)(0,0,0)[12] : AIC=814.396, Time=0.01 sec ARIMA(1,2,0)(2,0,0)[12] : AIC=804.832, Time=0.13 sec ARIMA(1,2,0)(1,0,1)[12] : AIC=804.832, Time=0.10 sec ARIMA(1,2,0)(0,0,1)[12] : AIC=806.502, Time=0.06 sec ARIMA(1,2,0)(2,0,1)[12] : AIC=806.831, Time=0.14 sec ARIMA(0,2,0)(1,0,0)[12] : AIC=810.386, Time=0.22 sec ARIMA(2,2,0)(1,0,0)[12] : AIC=792.117, Time=0.12 sec ARIMA(2,2,0)(0,0,0)[12] : AIC=802.986, Time=0.05 sec ARIMA(2,2,0)(2,0,0)[12] : AIC=794.117, Time=0.22 sec ARIMA(2,2,0)(1,0,1)[12] : AIC=794.117, Time=0.32 sec ARIMA(2,2,0)(0,0,1)[12] : AIC=795.274, Time=0.16 sec ARIMA(2,2,0)(2,0,1)[12] : AIC=796.117, Time=0.23 sec ARIMA(3,2,0)(1,0,0)[12] : AIC=790.173, Time=0.10 sec ARIMA(3,2,0)(0,0,0)[12] : AIC=799.920, Time=0.03 sec ARIMA(3,2,0)(2,0,0)[12] : AIC=792.062, Time=0.20 sec ARIMA(3,2,0)(1,0,1)[12] : AIC=792.064, Time=0.13 sec ARIMA(3,2,0)(0,0,1)[12] : AIC=792.224, Time=0.08 sec ARIMA(3,2,0)(2,0,1)[12] : AIC=794.062, Time=0.37 sec ARIMA(4,2,0)(1,0,0)[12] : AIC=792.292, Time=0.08 sec ARIMA(3,2,1)(1,0,0)[12] : AIC=790.457, Time=0.41 sec ARIMA(2,2,1)(1,0,0)[12] : AIC=788.567, Time=0.30 sec ARIMA(2,2,1)(0,0,0)[12] : AIC=793.161, Time=0.14 sec ARIMA(2,2,1)(2,0,0)[12] : AIC=790.537, Time=0.60 sec ARIMA(2,2,1)(1,0,1)[12] : AIC=790.538, Time=0.50 sec ARIMA(2,2,1)(0,0,1)[12] : AIC=789.649, Time=0.27 sec ARIMA(2,2,1)(2,0,1)[12] : AIC=792.537, Time=1.25 sec ARIMA(1,2,1)(1,0,0)[12] : AIC=inf, Time=0.53 sec ARIMA(2,2,2)(1,0,0)[12] : AIC=787.955, Time=0.42 sec ARIMA(2,2,2)(0,0,0)[12] : AIC=794.091, Time=0.26 sec ARIMA(2,2,2)(2,0,0)[12] : AIC=789.921, Time=0.94 sec ARIMA(2,2,2)(1,0,1)[12] : AIC=789.921, Time=0.60 sec ARIMA(2,2,2)(0,0,1)[12] : AIC=789.348, Time=0.39 sec ARIMA(2,2,2)(2,0,1)[12] : AIC=791.921, Time=1.47 sec ARIMA(1,2,2)(1,0,0)[12] : AIC=788.686, Time=0.40 sec 
ARIMA(3,2,2)(1,0,0)[12] : AIC=788.384, Time=1.71 sec ARIMA(2,2,3)(1,0,0)[12] : AIC=786.270, Time=0.70 sec ARIMA(2,2,3)(0,0,0)[12] : AIC=inf, Time=0.49 sec ARIMA(2,2,3)(2,0,0)[12] : AIC=788.214, Time=1.02 sec ARIMA(2,2,3)(1,0,1)[12] : AIC=788.214, Time=0.72 sec ARIMA(2,2,3)(0,0,1)[12] : AIC=788.253, Time=0.80 sec ARIMA(2,2,3)(2,0,1)[12] : AIC=790.214, Time=2.07 sec ARIMA(1,2,3)(1,0,0)[12] : AIC=788.261, Time=0.84 sec ARIMA(3,2,3)(1,0,0)[12] : AIC=inf, Time=1.39 sec ARIMA(2,2,4)(1,0,0)[12] : AIC=788.555, Time=1.65 sec ARIMA(1,2,4)(1,0,0)[12] : AIC=791.033, Time=1.11 sec ARIMA(3,2,4)(1,0,0)[12] : AIC=inf, Time=1.80 sec ARIMA(2,2,3)(1,0,0)[12] intercept : AIC=788.229, Time=0.70 sec Best model: ARIMA(2,2,3)(1,0,0)[12] Total fit time: 26.935 seconds
# Overlay the test window with every darts forecast produced so far;
# call order is preserved so matplotlib assigns the same colors.
for ts, lbl in [
    (val, "actual test"),
    (predictions_ses_test, "prediction SES"),
    (prediction_theta, "prediction Theta"),
    (predictions_darts_aarima, "prediction ARIMA(2,2,1)(0,0,1)[3] (darts auto_arima)"),
    (predictions_darts_aarima_12, "prediction ARIMA(2,2,3)(1,0,0)[12] (darts auto_arima)"),
]:
    ts.plot(label=lbl)
# Earlier statsmodels SARIMA forecast, added for reference.
plt.plot(results_sarima["Y_prediction"][9],label="predicted Testing data SARIMA(1,0,1)(1,1,0,12)")
plt.legend()
<matplotlib.legend.Legend at 0x27866b3d840>
#test ARIMA(2,2,1)(0,0,1)[3]
aarima_r2score_test = r2_score(predictions_darts_aarima,series)
aarima_mae_test = mae(predictions_darts_aarima,series)
aarima_mape_test = mape(predictions_darts_aarima,series)
aarima_rmse_test = rmse(predictions_darts_aarima,series)
#test ARIMA(2,2,3)(1,0,0)[12]
aarima_r2score_test_12 = r2_score(predictions_darts_aarima_12,series)
aarima_mae_test_12 = mae(predictions_darts_aarima_12,series)
aarima_mape_test_12 = mape(predictions_darts_aarima_12,series)
aarima_rmse_test_12 = rmse(predictions_darts_aarima_12,series)
evaluation_darts_aarima_model = {
"Models":["ARIMA(2,2,1)(0,0,1)[3]","ARIMA(2,2,3)(1,0,0)[12]"],
"R2_score_test":[aarima_r2score_test,aarima_r2score_test_12],
"MAE_test":[aarima_mae_test,aarima_mae_test_12],
"RMSE_test":[aarima_rmse_test,aarima_rmse_test_12],
"MAPE_test":[aarima_mape_test,aarima_mape_test_12],
}
evaluation_darts_aarima_model = pd.DataFrame(evaluation_darts_aarima_model).set_index("Models")
evaluation_darts_aarima_model
| R2_score_test | MAE_test | RMSE_test | MAPE_test | |
|---|---|---|---|---|
| Models | ||||
| ARIMA(2,2,1)(0,0,1)[3] | -9.689964 | 37326.878565 | 41393.297301 | 39.246918 |
| ARIMA(2,2,3)(1,0,0)[12] | -1.275564 | 24030.764008 | 26837.410485 | 30.431073 |
from darts.models import LinearRegressionModel

# Autoregressive linear regression: forecast one month ahead
# (output_chunk_length=1) from the last `lags` months, rolling forward to
# cover the whole validation window.
#lag = 12
model_lr = LinearRegressionModel(lags=12, output_chunk_length=1)
model_lr.fit(train)
prediction_lr = model_lr.predict(len(val))
#lag = 24
model_lr_24 = LinearRegressionModel(lags=24, output_chunk_length=1)
model_lr_24.fit(train)
prediction_lr_24 = model_lr_24.predict(len(val))
# Compare the two lag settings against the actual test window, SES, and the
# earlier statsmodels SARIMA forecast.
val.plot(label="actual test")
predictions_ses_test.plot(label="prediction SES")
prediction_lr.plot(label="prediction linear regression lag = 12")
prediction_lr_24.plot(label="prediction linear regression lag = 24")
plt.plot(results_sarima["Y_prediction"][9],label="predicted Testing data SARIMA(1,0,1)(1,1,0,12)")
plt.legend()
<matplotlib.legend.Legend at 0x27866c5bc70>
# Score both lag settings of the darts LinearRegressionModel.
# NOTE(review): prediction is passed as the first argument; darts.metrics
# expects (actual_series, pred_series) — confirm the binding of
# r2_score/mae/mape/rmse before trusting R2/MAPE values.
#test lag = 12
lr_r2score_test = r2_score(prediction_lr,series)
lr_mae_test = mae(prediction_lr,series)
lr_mape_test = mape(prediction_lr,series)
lr_rmse_test = rmse(prediction_lr,series)
#test lag = 24
lr_r2score_test_24 = r2_score(prediction_lr_24,series)
lr_mae_test_24 = mae(prediction_lr_24,series)
lr_mape_test_24 = mape(prediction_lr_24,series)
lr_rmse_test_24 = rmse(prediction_lr_24,series)
# Comparison table indexed by model name.
evaluation_lr_model = {
"Models":["LR,lag=12","LR,lag=24"],
"R2_score_test":[lr_r2score_test, lr_r2score_test_24],
"MAE_test":[lr_mae_test, lr_mae_test_24],
"RMSE_test":[lr_rmse_test, lr_rmse_test_24],
"MAPE_test":[lr_mape_test, lr_mape_test_24],
}
evaluation_lr_model = pd.DataFrame(evaluation_lr_model).set_index("Models")
evaluation_lr_model
| R2_score_test | MAE_test | RMSE_test | MAPE_test | |
|---|---|---|---|---|
| Models | ||||
| LR,lag=12 | -0.228606 | 41703.699097 | 48233.488788 | 39.084772 |
| LR,lag=24 | 0.261670 | 15091.531078 | 18985.499836 | 31.905839 |
In this linear regression model we used output_chunk_length = 1, which means we used the last 12 or 24 months to predict the next month. But what would happen if we used output_chunk_length = 12, which means we use the last 12 or 24 months to predict the next 12 months?
# Same linear-regression models, but each forecast step now emits 12 months
# at once (output_chunk_length=12) instead of a single month.
#lag = 12
model_lr_v2 = LinearRegressionModel(lags=12, output_chunk_length=12)
model_lr_v2.fit(train)
prediction_lr_v2 = model_lr_v2.predict(len(val))
#lag = 24
model_lr_24_v2 = LinearRegressionModel(lags=24, output_chunk_length=12)
model_lr_24_v2.fit(train)
prediction_lr_24_v2 = model_lr_24_v2.predict(len(val))
#lag = 3
model_lr_3_v2 = LinearRegressionModel(lags=3, output_chunk_length=12)
model_lr_3_v2.fit(train)
prediction_lr_3_v2 = model_lr_3_v2.predict(len(val))
# Compare v1 (chunk=1) and v2 (chunk=12) linear-regression forecasts.
val.plot(label="actual test")
predictions_ses_test.plot(label="prediction SES")
prediction_lr.plot(label="prediction linear regression lag = 12")
prediction_lr_v2.plot(label="prediction linear regression lag = 12 version 2")
prediction_lr_3_v2.plot(label="prediction linear regression lag = 3 ")
prediction_lr_24.plot(label="prediction linear regression lag = 24")
prediction_lr_24_v2.plot(label="prediction linear regression lag = 24 version2")
plt.plot(results_sarima["Y_prediction"][9],label="predicted Testing data SARIMA(1,0,1)(1,1,0,12)")
plt.legend()
<matplotlib.legend.Legend at 0x2786b3bffd0>
# Same comparison without the off-scale lag=12 v2 forecast, for readability.
val.plot(label="actual test")
predictions_ses_test.plot(label="prediction SES")
prediction_lr.plot(label="prediction linear regression lag = 12")
prediction_lr_24.plot(label="prediction linear regression lag = 24")
prediction_lr_3_v2.plot(label="prediction linear regression lag = 3 ")
prediction_lr_24_v2.plot(label="prediction linear regression lag = 24 version2")
plt.plot(results_sarima["Y_prediction"][9],label="predicted Testing data SARIMA(1,0,1)(1,1,0,12)")
plt.legend()
<matplotlib.legend.Legend at 0x2786b3f4280>
# Score the output_chunk_length=12 linear-regression variants.
# NOTE(review): prediction is passed as the first argument; darts.metrics
# expects (actual_series, pred_series) — confirm the metric bindings.
#test lag = 12
lr_r2score_test_v2 = r2_score(prediction_lr_v2,series)
lr_mae_test_v2 = mae(prediction_lr_v2,series)
lr_mape_test_v2 = mape(prediction_lr_v2,series)
lr_rmse_test_v2 = rmse(prediction_lr_v2,series)
#test lag = 24
lr_r2score_test_24_v2 = r2_score(prediction_lr_24_v2,series)
lr_mae_test_24_v2 = mae(prediction_lr_24_v2,series)
lr_mape_test_24_v2 = mape(prediction_lr_24_v2,series)
lr_rmse_test_24_v2 = rmse(prediction_lr_24_v2,series)
#test lag = 3   (comment fixed: it was a copy-paste of "lag = 24")
lr_r2score_test_3_v2 = r2_score(prediction_lr_3_v2,series)
lr_mae_test_3_v2 = mae(prediction_lr_3_v2,series)
lr_mape_test_3_v2 = mape(prediction_lr_3_v2,series)
lr_rmse_test_3_v2 = rmse(prediction_lr_3_v2,series)
# Comparison table; "LR,lag=3,v2" label capitalization fixed ("lR" -> "LR").
evaluation_lr_model_v2 = {
"Models":["LR,lag=12,v2","LR,lag=24,v2","LR,lag=3,v2"],
"R2_score_test":[lr_r2score_test_v2, lr_r2score_test_24_v2,lr_r2score_test_3_v2],
"MAE_test":[lr_mae_test_v2, lr_mae_test_24_v2,lr_mae_test_3_v2],
"RMSE_test":[lr_rmse_test_v2, lr_rmse_test_24_v2,lr_rmse_test_3_v2],
"MAPE_test":[lr_mape_test_v2, lr_mape_test_24_v2,lr_mape_test_3_v2],
}
evaluation_lr_model_v2 = pd.DataFrame(evaluation_lr_model_v2).set_index("Models")
evaluation_lr_model_v2
| R2_score_test | MAE_test | RMSE_test | MAPE_test | |
|---|---|---|---|---|
| Models | ||||
| LR,lag=12,v2 | -0.716584 | 257510.193941 | 286980.706372 | 120.819638 |
| LR,lag=24,v2 | 0.436841 | 13853.908533 | 17114.254366 | 34.848904 |
| lR,lag=3,v2 | 0.023775 | 21302.025956 | 23719.791011 | 59.636532 |
# Working frame for the feature-based ML models.
# general_data is the monthly-resampled sales series built earlier in the
# notebook (one row per month, single "Sales" column).
ml_data = general_data.copy()
ml_data.head()
| Sales | |
|---|---|
| 2011-01-31 | 13946.229 |
| 2011-02-28 | 4810.558 |
| 2011-03-31 | 55691.009 |
| 2011-04-30 | 28295.345 |
| 2011-05-31 | 23648.287 |
# Calendar features derived from the DatetimeIndex of the monthly frame.
for part in ("quarter", "month", "year"):
    ml_data[part] = getattr(ml_data.index, part)
Previously, using the pacf and acf plots, we observed that the first lag (in both acf and pacf) had a significant correlation, so we're going to add the previous month's sales to our data as a feature.
# Lag feature: last month's sales (the first ACF/PACF lag was significant).
# The first row gets NaN since it has no predecessor.
ml_data["previous_month"] = ml_data["Sales"].shift(1)
# our result:
ml_data.head()
| Sales | quarter | month | year | previous_month | |
|---|---|---|---|---|---|
| 2011-01-31 | 13946.229 | 1 | 1 | 2011 | NaN |
| 2011-02-28 | 4810.558 | 1 | 2 | 2011 | 13946.229 |
| 2011-03-31 | 55691.009 | 1 | 3 | 2011 | 4810.558 |
| 2011-04-30 | 28295.345 | 2 | 4 | 2011 | 55691.009 |
| 2011-05-31 | 23648.287 | 2 | 5 | 2011 | 28295.345 |
We got a null value in the first row of the "previous_month" column. We're going to use the average of sales in December to fill this value.
# Inspect every December row; their mean Sales will seed the missing first lag.
ml_data.loc[ml_data.index.month == 12 ]
| Sales | quarter | month | year | previous_month | |
|---|---|---|---|---|---|
| 2011-12-31 | 69545.6205 | 4 | 12 | 2011 | 78628.7167 |
| 2012-12-31 | 74919.5212 | 4 | 12 | 2012 | 75972.5635 |
| 2013-12-31 | 97237.4170 | 4 | 12 | 2013 | 82192.3228 |
| 2014-12-31 | 90474.6008 | 4 | 12 | 2014 | 112326.4710 |
#calculate the average
# Mean December sales across all years, used to fill the first lag value.
average_december = ml_data.loc[ml_data.index.month == 12, "Sales"].mean()
average_december
83044.289875
#filling the null value
# Confirm that only `previous_month` has a missing value (the first row).
ml_data.isna().sum()
Sales 0 quarter 0 month 0 year 0 previous_month 1 dtype: int64
# Fill the single missing lag value with the December average. Targeting the
# `previous_month` column explicitly (instead of a frame-wide fillna) avoids
# silently masking NaNs that might appear in other columns.
ml_data["previous_month"] = ml_data["previous_month"].fillna(average_december)
ml_data.head()
| Sales | quarter | month | year | previous_month | |
|---|---|---|---|---|---|
| 2011-01-31 | 13946.229 | 1 | 1 | 2011 | 83044.289875 |
| 2011-02-28 | 4810.558 | 1 | 2 | 2011 | 13946.229000 |
| 2011-03-31 | 55691.009 | 1 | 3 | 2011 | 4810.558000 |
| 2011-04-30 | 28295.345 | 2 | 4 | 2011 | 55691.009000 |
| 2011-05-31 | 23648.287 | 2 | 5 | 2011 | 28295.345000 |
#### without previous month
# Variant dataset with only the calendar features (no lag feature).
ml_data2 = ml_data.drop(columns="previous_month")
ml_data2.head()
| Sales | quarter | month | year | |
|---|---|---|---|---|
| 2011-01-31 | 13946.229 | 1 | 1 | 2011 |
| 2011-02-28 | 4810.558 | 1 | 2 | 2011 |
| 2011-03-31 | 55691.009 | 1 | 3 | 2011 |
| 2011-04-30 | 28295.345 | 2 | 4 | 2011 |
| 2011-05-31 | 23648.287 | 2 | 5 | 2011 |
# 75/25 chronological split — no shuffling, since this is a time series.
split_at = int(ml_data.shape[0] * 0.75)
ml_data_train = ml_data.iloc[:split_at]
ml_data_test = ml_data.iloc[split_at:]
print(f"train size {ml_data_train.shape[0]}")
print(f"test size {ml_data_test.shape[0]}")
train size 36 test size 12
# Feature/target split: everything except "Sales" is a feature.
#xtrain,y_train
x_train = ml_data_train.drop(columns="Sales")
y_train = ml_data_train["Sales"]
#xtest,y_test
x_test = ml_data_test.drop(columns="Sales")
y_test = ml_data_test["Sales"]
print(f"x_train's shape: {x_train.shape}")
print(f"x_test's shape: {x_test.shape}")
x_train's shape: (36, 4) x_test's shape: (12, 4)
#### without previous
# Same 75/25 chronological split for the no-lag variant.
cut = int(ml_data2.shape[0] * 0.75)
ml_data_train2 = ml_data2.iloc[:cut]
ml_data_test2 = ml_data2.iloc[cut:]
print(f"train size {ml_data_train2.shape[0]}")
print(f"test size {ml_data_test2.shape[0]}")
#xtrain,y_train
x_train2 = ml_data_train2.drop(columns="Sales")
y_train2 = ml_data_train2["Sales"]
#xtest,y_test
x_test2 = ml_data_test2.drop(columns="Sales")
y_test2 = ml_data_test2["Sales"]
import xgboost as xgb
# Gradient-boosted trees; n_estimators is deliberately huge because early
# stopping (4000 rounds without validation improvement) caps the real count —
# the log below shows it stopped around 13k trees.
xgb_model = xgb.XGBRegressor(n_estimators=1000000 ,
early_stopping_rounds = 4000,
learning_rate=0.001)
# NOTE(review): (x_test, y_test) is the last eval_set entry, so the test set
# drives early stopping — a mild form of leakage; confirm this is intended.
xgb_model.fit(x_train,y_train,
eval_set=[(x_train,y_train),(x_test,y_test)],
verbose= 10000)
[0] validation_0-rmse:49111.40360 validation_1-rmse:66106.44463 [10000] validation_0-rmse:223.57997 validation_1-rmse:18423.13776 [12966] validation_0-rmse:60.39003 validation_1-rmse:18432.51329
XGBRegressor(base_score=None, booster=None, callbacks=None,
colsample_bylevel=None, colsample_bynode=None,
colsample_bytree=None, early_stopping_rounds=4000,
enable_categorical=False, eval_metric=None, feature_types=None,
gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
interaction_constraints=None, learning_rate=0.001, max_bin=None,
max_cat_threshold=None, max_cat_to_onehot=None,
max_delta_step=None, max_depth=None, max_leaves=None,
min_child_weight=None, missing=nan, monotone_constraints=None,
n_estimators=1000000, n_jobs=None, num_parallel_tree=None,
predictor=None, random_state=None, ...)
# Display the fitted model's feature importances ("imortance" typo fixed).
print("Features importance:")
pd.DataFrame(
    data=xgb_model.feature_importances_,
    index=xgb_model.feature_names_in_,
    columns=["importance"],
)
Features imortance:
| importance | |
|---|---|
| quarter | 0.079890 |
| month | 0.823684 |
| year | 0.045479 |
| previous_month | 0.050947 |
# Single-shot forecast over the whole test window (no rolling refit).
# NOTE(review): ml_data_test is a slice of ml_data — this column assignment
# can trigger pandas' SettingWithCopyWarning; confirm intent.
ml_data_test["prediction_full_features"] = xgb_model.predict(x_test)
plt.plot(ml_data_test["Sales"],label="Actual data")
plt.plot(ml_data_test["prediction_full_features"],label="predcition with all features")
plt.legend()
<matplotlib.legend.Legend at 0x27867347a90>
We observed that the first 5 predictions were very close, so rolling forecasts may produce better results.
# Rolling (walk-forward) forecast: refit the model each month on all data
# seen so far, predict one step, then append that month's ACTUAL sales row
# to the training set before the next refit.
train_features = ['quarter', 'month', 'year', 'previous_month']
test_features = 'Sales'  # target column name (single column despite the plural)
#### rolling forecast
test_set_range = list(data_test.index)
#test_set_range.insert(0,data_train.index[-1])
#predictions
y_pred_xgb = []
xgb_model_roll = xgb.XGBRegressor(n_estimators=1000000 ,
early_stopping_rounds = 4000,
learning_rate=0.001)
ml_data_train_roll = ml_data_train.copy()
# Seed for the lag feature; only consumed when i != 0, by which point it has
# been overwritten with the previous prediction — so this initial actual
# value is effectively unused.
prev_sale = ml_data_test.loc[ml_data_test.index[0]][test_features]
for i in tqdm(range(x_test.shape[0])):
    x_train_roll, y_train_roll = ml_data_train_roll.drop("Sales",axis=1),ml_data_train_roll["Sales"]
    #x_train_roll , y_train_roll = ml_data.loc[:stop_date][train_features], ml_data.loc[:stop_date][test_features]
    # One-row frame holding this test month's features.
    x_test_roll = ml_data_test.loc[ml_data_test.index[i]][train_features].to_frame().transpose()
    if(i!=0):
        # Use the model's previous PREDICTION as the lag, not the actual value.
        x_test_roll["previous_month"] = prev_sale
    #retrain the model
    xgb_model_roll.fit(x_train_roll,y_train_roll,
    eval_set=[(x_train_roll,y_train_roll)],
    verbose = False)
    pred = xgb_model_roll.predict(x_test_roll)
    y_pred_xgb.append(pred[0])
    #insert the predicted row with in the training data
    # The appended row carries the month's ACTUAL sales, so later refits
    # train on ground truth rather than on the model's own forecasts.
    x_test_roll["Sales"] = ml_data_test.loc[ml_data_test.index[i]][test_features]
    ml_data_train_roll = pd.concat([ml_data_train_roll,x_test_roll],axis=0)
    prev_sale = pred
100%|██████████████████████████████████████████████████████████████████████████████████| 12/12 [06:09<00:00, 30.83s/it]
# Plot rolling vs single-shot XGBoost forecasts against the actuals and the
# earlier SARIMA forecast ("predcition" typos in the legend labels fixed).
ml_data_test["y_pred_xgb_rolling"] = y_pred_xgb
plt.plot(ml_data_test["Sales"], label="Actual data")
plt.plot(ml_data_test["prediction_full_features"], label="prediction with all features")
plt.plot(ml_data_test["y_pred_xgb_rolling"], label="prediction rolling with all features")
plt.plot(results_sarima["Y_prediction"][9], label="predicted Testing data SARIMA(1,0,1)(1,1,0,12)")
plt.legend()
<matplotlib.legend.Legend at 0x2786c3e9f90>
# Feature importances of the last rolling refit ("imortance" typo fixed).
print("Features importance:")
pd.DataFrame(
    data=xgb_model_roll.feature_importances_,
    index=xgb_model_roll.feature_names_in_,
    columns=["importance"],
)
Features imortance:
| importance | |
|---|---|
| quarter | 0.021787 |
| month | 0.839823 |
| year | 0.086483 |
| previous_month | 0.051906 |
#####without previous month
# Calendar-only feature list (drops the previous_month lag).
train_features = ['quarter', 'month', 'year']
ml_data_train[train_features].head()
| quarter | month | year | |
|---|---|---|---|
| 2011-01-31 | 1 | 1 | 2011 |
| 2011-02-28 | 1 | 2 | 2011 |
| 2011-03-31 | 1 | 3 | 2011 |
| 2011-04-30 | 2 | 4 | 2011 |
| 2011-05-31 | 2 | 5 | 2011 |
# Same walk-forward loop as before, but WITHOUT the previous_month lag
# feature (the prev_sale bookkeeping is commented out accordingly).
train_features = ['quarter', 'month', 'year']
test_features = 'Sales'
#### rolling forecast
test_set_range = list(data_test.index)
#test_set_range.insert(0,data_train.index[-1])
#predictions
y_pred_xgb_2 = []
xgb_model_roll = xgb.XGBRegressor(n_estimators=1000000 ,
early_stopping_rounds = 4000,
learning_rate=0.001)
ml_data_train_roll = ml_data_train2.copy()
#prev_sale = ml_data_test.loc[ml_data_test.index[0]][test_features]
for i in tqdm(range(x_test.shape[0])):
    x_train_roll, y_train_roll = ml_data_train_roll.drop("Sales",axis=1),ml_data_train_roll["Sales"]
    #x_train_roll , y_train_roll = ml_data.loc[:stop_date][train_features], ml_data.loc[:stop_date][test_features]
    # Features come from ml_data_test, but train_features excludes the lag
    # column, so only the calendar columns are selected here.
    x_test_roll = ml_data_test.loc[ml_data_test.index[i]][train_features].to_frame().transpose()
    #if(i!=0):
    #    x_test_roll["previous_month"] = prev_sale
    #retrain the model
    xgb_model_roll.fit(x_train_roll,y_train_roll,
    eval_set=[(x_train_roll,y_train_roll)],
    verbose = False)
    pred = xgb_model_roll.predict(x_test_roll)
    y_pred_xgb_2.append(pred[0])
    #insert the predicted row with in the training data
    # Append the month's ACTUAL sales before the next refit.
    x_test_roll["Sales"] = ml_data_test.loc[ml_data_test.index[i]][test_features]
    ml_data_train_roll = pd.concat([ml_data_train_roll,x_test_roll],axis=0)
    #prev_sale = pred
100%|██████████████████████████████████████████████████████████████████████████████████| 12/12 [06:31<00:00, 32.60s/it]
# Rolling forecast without the lag feature, against actuals and SARIMA.
ml_data_test["y_pred_xgb_rolling_2"]=y_pred_xgb_2
plt.plot(ml_data_test["Sales"],label="Actual data")
plt.plot(ml_data_test["y_pred_xgb_rolling_2"],label="y_pred_xgb_rolling without previous (feature)")
#plt.plot(ml_data_test["y_pred_xgb_rolling"],label="predcition rolling with all features")
plt.plot(results_sarima["Y_prediction"][9],label="predicted Testing data SARIMA(1,0,1)(1,1,0,12)")
plt.legend()
<matplotlib.legend.Legend at 0x2786c47c790>
# Feature importances for the calendar-only model ("imortance" typo fixed).
print("Features importance 2:")
pd.DataFrame(
    data=xgb_model_roll.feature_importances_,
    index=xgb_model_roll.feature_names_in_,
    columns=["importance"],
)
Features imortance 2:
| importance | |
|---|---|
| quarter | 0.148078 |
| month | 0.770633 |
| year | 0.081289 |
Based on the feature importances, we'll drop the quarter feature.
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error, r2_score, mean_squared_error

def _scores(y_true, y_pred):
    """Return (r2, mae, mape, rmse) for one prediction column.

    BUG FIX: the original stored the raw MSE under the "RMSE" label
    (values around 3e8 in the output table); np.sqrt gives the true RMSE.
    """
    return (
        r2_score(y_true, y_pred),
        mean_absolute_error(y_true, y_pred),
        mean_absolute_percentage_error(y_true, y_pred),
        np.sqrt(mean_squared_error(y_true, y_pred)),
    )

# single-shot forecast (all features)
xgb_r2score_test, xgb_mae_test, xgb_mape_test, xgb_rmse_test = _scores(
    ml_data_test["Sales"], ml_data_test["prediction_full_features"])
# rolling forecast (all features)
xgb_r2score_test_rolling, xgb_mae_test_rolling, xgb_mape_test_rolling, xgb_rmse_test_rolling = _scores(
    ml_data_test["Sales"], ml_data_test["y_pred_xgb_rolling"])
# rolling forecast without the previous_month feature
xgb_r2score_test_rolling_2, xgb_mae_test_rolling_2, xgb_mape_test_rolling_2, xgb_rmse_test_rolling_2 = _scores(
    ml_data_test["Sales"], ml_data_test["y_pred_xgb_rolling_2"])

# Comparison table for the three XGBoost variants.
evaluation_xgb_model = {
    "Models": ["xgb normal", "xgb rolling", "xgb rolling 2"],
    "R2_score_test": [xgb_r2score_test, xgb_r2score_test_rolling, xgb_r2score_test_rolling_2],
    "MAE_test": [xgb_mae_test, xgb_mae_test_rolling, xgb_mae_test_rolling_2],
    "RMSE_test": [xgb_rmse_test, xgb_rmse_test_rolling, xgb_rmse_test_rolling_2],
    "MAPE_test": [xgb_mape_test, xgb_mape_test_rolling, xgb_mape_test_rolling_2],
}
evaluation_xgb_model = pd.DataFrame(evaluation_xgb_model).set_index("Models")
evaluation_xgb_model
| R2_score_test | MAE_test | RMSE_test | MAPE_test | |
|---|---|---|---|---|
| Models | ||||
| xgb normal | 0.466453 | 14006.995454 | 3.388316e+08 | 0.213764 |
| xgb rolling | 0.610246 | 12704.546376 | 2.475156e+08 | 0.205903 |
| xgb rolling 2 | 0.525637 | 14091.497517 | 3.012469e+08 | 0.286398 |
from sklearn.ensemble import RandomForestRegressor
# 10k trees is heavyweight for 36 training rows.
# NOTE(review): no random_state is set, so results vary between runs.
rfr_model = RandomForestRegressor(n_estimators=10000)
rfr_model.fit(x_train,y_train)
RandomForestRegressor(n_estimators=10000)
#### forecast on testset
# Single-shot RandomForest forecast, plotted next to the XGBoost variants
# and the SARIMA reference.
ml_data_test["y_pred_rfr_regressor"] = rfr_model.predict(x_test)
plt.plot(ml_data_test["Sales"],label="Actual data")
plt.plot(ml_data_test["prediction_full_features"],label="predcition with all features")
plt.plot(ml_data_test["y_pred_xgb_rolling"],label="predcition rolling with all features")
plt.plot(ml_data_test["y_pred_rfr_regressor"],label="y_pred_rfr_regressor with all features")
plt.plot(results_sarima["Y_prediction"][9],label="predicted Testing data SARIMA(1,0,1)(1,1,0,12)")
plt.legend()
<matplotlib.legend.Legend at 0x278075677c0>
# Walk-forward RandomForest: same rolling protocol as the XGBoost loop —
# refit monthly, feed the previous PREDICTION as the lag feature, append the
# month's ACTUAL sales before the next refit.
train_features = ['quarter', 'month', 'year', 'previous_month']
test_features = 'Sales'
#### rolling forecast
test_set_range = list(data_test.index)
#test_set_range.insert(0,data_train.index[-1])
#predictions
y_pred_rfr = []
rfr_model = RandomForestRegressor(n_estimators=10000)
ml_data_train_roll = ml_data_train.copy()
# Seed for the lag feature; overwritten by the first prediction before use.
prev_sale = ml_data_test.loc[ml_data_test.index[0]][test_features]
for i in tqdm(range(x_test.shape[0])):
    x_train_roll, y_train_roll = ml_data_train_roll.drop("Sales",axis=1),ml_data_train_roll["Sales"]
    #x_train_roll , y_train_roll = ml_data.loc[:stop_date][train_features], ml_data.loc[:stop_date][test_features]
    x_test_roll = ml_data_test.loc[ml_data_test.index[i]][train_features].to_frame().transpose()
    if(i!=0):
        x_test_roll["previous_month"] = prev_sale
    #retrain the model
    rfr_model.fit(x_train_roll,y_train_roll)
    pred = rfr_model.predict(x_test_roll)
    y_pred_rfr.append(pred[0])
    #insert the predicted row with in the training data
    x_test_roll["Sales"] = ml_data_test.loc[ml_data_test.index[i]][test_features]
    ml_data_train_roll = pd.concat([ml_data_train_roll,x_test_roll],axis=0)
    prev_sale = pred
100%|██████████████████████████████████████████████████████████████████████████████████| 12/12 [04:55<00:00, 24.62s/it]
# Rolling vs single-shot RandomForest forecasts.
ml_data_test["y_pred_rfr_rolling"]=y_pred_rfr
plt.plot(ml_data_test["Sales"],label="Actual data")
plt.plot(ml_data_test["y_pred_rfr_rolling"],label="y_pred_rfr_rolling with all features")
#plt.plot(ml_data_test["y_pred_xgb_rolling"],label="predcition rolling with all features")
plt.plot(ml_data_test["y_pred_rfr_regressor"],label="y_pred_rfr_regressor with all features")
#plt.plot(results_sarima["Y_prediction"][9],label="predicted Testing data SARIMA(1,0,1)(1,1,0,12)")
plt.legend()
<matplotlib.legend.Legend at 0x2780e8c5480>
# Evaluate RandomForest: single-shot vs rolling forecasts on the test year.
# (The leftover "#test lag = ..." comments were copy-paste noise and are gone.)
# single-shot forecast
rfr_r2score_test = r2_score(ml_data_test["Sales"], ml_data_test["y_pred_rfr_regressor"])
rfr_mae_test = mean_absolute_error(ml_data_test["Sales"], ml_data_test["y_pred_rfr_regressor"])
rfr_mape_test = mean_absolute_percentage_error(ml_data_test["Sales"], ml_data_test["y_pred_rfr_regressor"])
# BUG FIX: mean_squared_error returns MSE; take the square root for RMSE.
rfr_rmse_test = np.sqrt(mean_squared_error(ml_data_test["Sales"], ml_data_test["y_pred_rfr_regressor"]))
# rolling forecast
rfr_r2score_test_rolling = r2_score(ml_data_test["Sales"], ml_data_test["y_pred_rfr_rolling"])
rfr_mae_test_rolling = mean_absolute_error(ml_data_test["Sales"], ml_data_test["y_pred_rfr_rolling"])
rfr_mape_test_rolling = mean_absolute_percentage_error(ml_data_test["Sales"], ml_data_test["y_pred_rfr_rolling"])
rfr_rmse_test_rolling = np.sqrt(mean_squared_error(ml_data_test["Sales"], ml_data_test["y_pred_rfr_rolling"]))
evaluation_rfr_model = {
    "Models": ["rfr normal", "rfr rolling"],
    "R2_score_test": [rfr_r2score_test, rfr_r2score_test_rolling],
    "MAE_test": [rfr_mae_test, rfr_mae_test_rolling],
    "RMSE_test": [rfr_rmse_test, rfr_rmse_test_rolling],
    "MAPE_test": [rfr_mape_test, rfr_mape_test_rolling],
}
evaluation_rfr_model = pd.DataFrame(evaluation_rfr_model).set_index("Models")
evaluation_rfr_model
| R2_score_test | MAE_test | RMSE_test | MAPE_test | |
|---|---|---|---|---|
| Models | ||||
| rfr normal | 0.392189 | 14750.156216 | 3.859934e+08 | 0.214972 |
| rfr rolling | 0.514341 | 13368.272388 | 3.084201e+08 | 0.220105 |
from sklearn.linear_model import LinearRegression
# Ordinary least squares on the calendar + lag features.
lr_model = LinearRegression()
lr_model.fit(x_train,y_train)
LinearRegression()
# Single-shot linear-regression forecast next to the other models.
ml_data_test["y_pred_lr"] = lr_model.predict(x_test)
plt.plot(ml_data_test["Sales"],label="Actual data")
plt.plot(ml_data_test["prediction_full_features"],label="predcition with all features")
plt.plot(ml_data_test["y_pred_xgb_rolling"],label="predcition rolling with all features")
plt.plot(ml_data_test["y_pred_lr"],label="y_pred linear regression with all features")
plt.plot(results_sarima["Y_prediction"][9],label="predicted Testing data SARIMA(1,0,1)(1,1,0,12)")
plt.legend()
<matplotlib.legend.Legend at 0x2780a77bca0>
# Walk-forward linear regression: same rolling protocol as the tree models.
train_features = ['quarter', 'month', 'year', 'previous_month']
test_features = 'Sales'
#### rolling forecast
test_set_range = list(data_test.index)
#test_set_range.insert(0,data_train.index[-1])
#predictions
y_pred_lr = []
lr_model = LinearRegression()
ml_data_train_roll = ml_data_train.copy()
# Seed for the lag feature; overwritten by the first prediction before use.
prev_sale = ml_data_test.loc[ml_data_test.index[0]][test_features]
for i in tqdm(range(x_test.shape[0])):
    x_train_roll, y_train_roll = ml_data_train_roll.drop("Sales",axis=1),ml_data_train_roll["Sales"]
    #x_train_roll , y_train_roll = ml_data.loc[:stop_date][train_features], ml_data.loc[:stop_date][test_features]
    x_test_roll = ml_data_test.loc[ml_data_test.index[i]][train_features].to_frame().transpose()
    if(i!=0):
        # Previous PREDICTION feeds the lag feature.
        x_test_roll["previous_month"] = prev_sale
    #retrain the model
    lr_model.fit(x_train_roll,y_train_roll)
    pred = lr_model.predict(x_test_roll)
    y_pred_lr.append(pred[0])
    #insert the predicted row with in the training data
    x_test_roll["Sales"] = ml_data_test.loc[ml_data_test.index[i]][test_features]
    ml_data_train_roll = pd.concat([ml_data_train_roll,x_test_roll],axis=0)
    prev_sale = pred
100%|██████████████████████████████████████████████████████████████████████████████████| 12/12 [00:00<00:00, 87.80it/s]
# Rolling vs single-shot linear-regression forecasts.
ml_data_test["y_pred_lr_rolling"]=y_pred_lr
plt.plot(ml_data_test["Sales"],label="Actual data")
plt.plot(ml_data_test["y_pred_lr_rolling"],label="y_pred lr rolling with all features")
plt.plot(ml_data_test["y_pred_lr"],label="y_pred lr with all features")
#plt.plot(ml_data_test["y_pred_xgb_rolling"],label="predcition rolling with all features")
plt.plot(ml_data_test["y_pred_rfr_regressor"],label="y_pred_rfr_regressor with all features")
#plt.plot(results_sarima["Y_prediction"][9],label="predicted Testing data SARIMA(1,0,1)(1,1,0,12)")
plt.legend()
<matplotlib.legend.Legend at 0x2780a865720>
# Evaluate sklearn linear regression: single-shot vs rolling.
# Fixes: "RMSE" column previously held raw MSE; model labels previously said
# "rfr" (copy-paste from the RandomForest cell) — they now say "lr".
# NOTE(review): this rebinds `evaluation_lr_model`, clobbering the darts
# LinearRegressionModel table of the same name built earlier.
# single-shot forecast
lr_r2score_test = r2_score(ml_data_test["Sales"], ml_data_test["y_pred_lr"])
lr_mae_test = mean_absolute_error(ml_data_test["Sales"], ml_data_test["y_pred_lr"])
lr_mape_test = mean_absolute_percentage_error(ml_data_test["Sales"], ml_data_test["y_pred_lr"])
lr_rmse_test = np.sqrt(mean_squared_error(ml_data_test["Sales"], ml_data_test["y_pred_lr"]))
# rolling forecast
lr_r2score_test_rolling = r2_score(ml_data_test["Sales"], ml_data_test["y_pred_lr_rolling"])
lr_mae_test_rolling = mean_absolute_error(ml_data_test["Sales"], ml_data_test["y_pred_lr_rolling"])
lr_mape_test_rolling = mean_absolute_percentage_error(ml_data_test["Sales"], ml_data_test["y_pred_lr_rolling"])
lr_rmse_test_rolling = np.sqrt(mean_squared_error(ml_data_test["Sales"], ml_data_test["y_pred_lr_rolling"]))
evaluation_lr_model = {
    "Models": ["lr normal", "lr rolling"],
    "R2_score_test": [lr_r2score_test, lr_r2score_test_rolling],
    "MAE_test": [lr_mae_test, lr_mae_test_rolling],
    "RMSE_test": [lr_rmse_test, lr_rmse_test_rolling],
    "MAPE_test": [lr_mape_test, lr_mape_test_rolling],
}
evaluation_lr_model = pd.DataFrame(evaluation_lr_model).set_index("Models")
evaluation_lr_model
| R2_score_test | MAE_test | RMSE_test | MAPE_test | |
|---|---|---|---|---|
| Models | ||||
| rfr normal | 0.615397 | 11547.233942 | 2.442441e+08 | 0.215073 |
| rfr rolling | 0.637116 | 11613.265428 | 2.304515e+08 | 0.219678 |
from sklearn.neighbors import KNeighborsRegressor
# 3-nearest-neighbours regression on the calendar + lag features.
# NOTE(review): features are unscaled, so `year` (~2011+) dominates the
# Euclidean distance — confirm whether scaling was intended.
knn_model = KNeighborsRegressor(n_neighbors=3)
knn_model.fit(x_train,y_train )
KNeighborsRegressor(n_neighbors=3)
# Single-shot KNN forecast next to the tree-model forecasts and SARIMA.
ml_data_test["y_pred_knn_regressor"] = knn_model.predict(x_test)
plt.plot(ml_data_test["Sales"],label="Actual data")
plt.plot(ml_data_test["y_pred_knn_regressor"],label="predcition knn_regressor with all features")
plt.plot(ml_data_test["y_pred_xgb_rolling"],label="predcition xgb rolling with all features")
plt.plot(ml_data_test["y_pred_rfr_regressor"],label="y_pred_rfr_regressor with all features")
plt.plot(results_sarima["Y_prediction"][9],label="predicted Testing data SARIMA(1,0,1)(1,1,0,12)")
plt.legend()
<matplotlib.legend.Legend at 0x2780ef1d330>
# Walk-forward KNN: same rolling protocol as the other models.
train_features = ['quarter', 'month', 'year', 'previous_month']
test_features = 'Sales'
#### rolling forecast
test_set_range = list(data_test.index)
#test_set_range.insert(0,data_train.index[-1])
#predictions
y_pred_knn = []
knn_model = KNeighborsRegressor(n_neighbors=3)
ml_data_train_roll = ml_data_train.copy()
# Seed for the lag feature; overwritten by the first prediction before use.
prev_sale = ml_data_test.loc[ml_data_test.index[0]][test_features]
for i in tqdm(range(x_test.shape[0])):
    x_train_roll, y_train_roll = ml_data_train_roll.drop("Sales",axis=1),ml_data_train_roll["Sales"]
    #x_train_roll , y_train_roll = ml_data.loc[:stop_date][train_features], ml_data.loc[:stop_date][test_features]
    x_test_roll = ml_data_test.loc[ml_data_test.index[i]][train_features].to_frame().transpose()
    if(i!=0):
        x_test_roll["previous_month"] = prev_sale
    #retrain the model
    knn_model.fit(x_train_roll,y_train_roll)
    pred = knn_model.predict(x_test_roll)
    y_pred_knn.append(pred[0])
    #insert the predicted row with in the training data
    x_test_roll["Sales"] = ml_data_test.loc[ml_data_test.index[i]][test_features]
    ml_data_train_roll = pd.concat([ml_data_train_roll,x_test_roll],axis=0)
    prev_sale = pred
100%|██████████████████████████████████████████████████████████████████████████████████| 12/12 [00:00<00:00, 77.29it/s]
# Rolling vs single-shot KNN forecasts.
# Legend labels fixed: the plotted columns are the KNN predictions, but the
# original labels said "rfr" (copy-paste from the RandomForest cell).
ml_data_test["y_pred_knn_rolling"] = y_pred_knn
plt.plot(ml_data_test["Sales"], label="Actual data")
plt.plot(ml_data_test["y_pred_knn_rolling"], label="y_pred_knn_rolling with all features")
plt.plot(ml_data_test["y_pred_knn_regressor"], label="y_pred_knn_regressor with all features")
plt.legend()
<matplotlib.legend.Legend at 0x2780ea47a30>
# Evaluate KNN: single-shot vs rolling forecasts.
# BUG FIX: "RMSE" column previously held raw MSE (values around 1e9 in the
# output table); np.sqrt yields the true RMSE.
# single-shot forecast
knn_r2score_test = r2_score(ml_data_test["Sales"],ml_data_test["y_pred_knn_regressor"])
knn_mae_test = mean_absolute_error(ml_data_test["Sales"],ml_data_test["y_pred_knn_regressor"])
knn_mape_test = mean_absolute_percentage_error(ml_data_test["Sales"], ml_data_test["y_pred_knn_regressor"])
knn_rmse_test = np.sqrt(mean_squared_error(ml_data_test["Sales"], ml_data_test["y_pred_knn_regressor"]))
# rolling forecast
knn_r2score_test_rolling = r2_score(ml_data_test["Sales"],ml_data_test["y_pred_knn_rolling"])
knn_mae_test_rolling = mean_absolute_error(ml_data_test["Sales"], ml_data_test["y_pred_knn_rolling"])
knn_mape_test_rolling = mean_absolute_percentage_error(ml_data_test["Sales"], ml_data_test["y_pred_knn_rolling"])
knn_rmse_test_rolling = np.sqrt(mean_squared_error(ml_data_test["Sales"], ml_data_test["y_pred_knn_rolling"]))
evaluation_knn_model = {
    "Models": ["knn normal", "knn rolling"],
    "R2_score_test": [knn_r2score_test, knn_r2score_test_rolling],
    "MAE_test": [knn_mae_test, knn_mae_test_rolling],
    "RMSE_test": [knn_rmse_test, knn_rmse_test_rolling],
    "MAPE_test": [knn_mape_test, knn_mape_test_rolling],
}
evaluation_knn_model = pd.DataFrame(evaluation_knn_model).set_index("Models")
evaluation_knn_model
| R2_score_test | MAE_test | RMSE_test | MAPE_test | |
|---|---|---|---|---|
| Models | ||||
| knn normal | -0.193145 | 20863.752822 | 7.577127e+08 | 0.338539 |
| knn rolling | -0.614889 | 25096.141961 | 1.025544e+09 | 0.396945 |
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import *
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping , ReduceLROnPlateau
from tensorflow.keras.losses import mean_squared_error as mse
from tensorflow.keras.metrics import R2Score , RootMeanSquaredError
from tensorflow.keras.optimizers import Adam
from keras.regularizers import l2
import keras
from keras.preprocessing.sequence import TimeseriesGenerator
# define generator: sliding windows of 12 months -> next-month target, batch of 1
n_input = 12
n_features = 1
generator = TimeseriesGenerator(data_train["Sales"],data_train['Sales'], length=n_input,batch_size=1)
#we are using the last 12 months to predict the next month
keras.utils.set_random_seed(812)
# Bidirectional LSTM (200 units per direction) with L2 regularisation on
# kernel/recurrent/bias weights, then a small dense head and a linear output.
model_lstm = Sequential()
model_lstm.add(InputLayer((n_input,n_features)))
model_lstm.add(Bidirectional(LSTM(200,activation='relu',return_sequences=False, kernel_regularizer=l2(0.001),
recurrent_regularizer=l2(0.001), bias_regularizer=l2(0.001))))
#model_lstm.add(Dropout(0.3))
#model_lstm.add(LSTM(120,activation='relu',return_sequences=False, kernel_regularizer=l2(0.001)))
model_lstm.add(Dense(100,activation='relu', kernel_regularizer=l2(0.001)))
#model_lstm.add(Dropout(0.2))
model_lstm.add(Dense(4,activation='relu', kernel_regularizer=l2(0.001)))
model_lstm.add(Dense(1))
#model_lstm.compile(loss="mean_squared_error",optimizer="adam",metrics=["mape"])
# MSE loss, Adam with lr=1e-3, MAPE tracked as a monitoring metric.
model_lstm.compile(loss="mean_squared_error",optimizer=Adam(0.001),metrics=["mape"])
#model_lstm.compile(loss="mean_squared_error",optimizer=Adam(0.00001),metrics=[R2Score(),RootMeanSquaredError()])
model_lstm.summary()
Model: "sequential_36"
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
bidirectional_35 (Bidirect (None, 400) 323200
ional)
dense_83 (Dense) (None, 100) 40100
dense_84 (Dense) (None, 4) 404
dense_85 (Dense) (None, 1) 5
=================================================================
Total params: 363709 (1.39 MB)
Trainable params: 363709 (1.39 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
# Early stopping / LR reduction on TRAINING loss (no validation split here).
ES = EarlyStopping(monitor="loss", verbose=1,
patience=500, restore_best_weights=True
)
reduce_lr = ReduceLROnPlateau(monitor='loss', factor=0.1,
patience=200, min_lr=0.00001)
model_lstm.fit(generator,epochs=2000,callbacks=[ES,reduce_lr])
# Seed the recursive test forecast with the LAST 12 training months.
# BUGFIX: the original used data_train[:12] (the FIRST 12 months), which
# contradicts the "#last 12 month" comment and the scaled variant below
# that correctly uses data_train_scaled[-12:].
last_train_batch = data_train[-12:]
last_train_batch = last_train_batch.to_numpy()
#get the generator shape (1,12,1)
current_batch = last_train_batch.reshape((1,n_input,n_features))
current_batch.shape
# Recursive multi-step forecast: each prediction is fed back into the window.
y_pred_lstm = []
for i in range(len(data_test)):
    #prediction
    current_pred = model_lstm.predict(current_batch)[0]
    #save the prediction
    y_pred_lstm.append(current_pred)
    # slide the window one step: drop the oldest month, append the prediction
    current_batch = np.append(current_batch[:,1:,:],[[current_pred]],axis=1)
1/1 [==============================] - 0s 22ms/step 1/1 [==============================] - 0s 31ms/step 1/1 [==============================] - 0s 18ms/step 1/1 [==============================] - 0s 26ms/step 1/1 [==============================] - 0s 26ms/step 1/1 [==============================] - 0s 26ms/step 1/1 [==============================] - 0s 24ms/step 1/1 [==============================] - 0s 26ms/step 1/1 [==============================] - 0s 14ms/step 1/1 [==============================] - 0s 27ms/step 1/1 [==============================] - 0s 20ms/step 1/1 [==============================] - 0s 29ms/step
ml_data_test["y_pred_lstm"] = y_pred_lstm
plt.plot(y_pred_lstm,label="LSTM prediction")
# FIX: this plots the TEST data; the original label said "actual train".
plt.plot(data_test.to_numpy(),label="actual test")
plt.legend()
<matplotlib.legend.Legend at 0x279302b1cc0>
mean_absolute_percentage_error(y_test, y_pred_lstm), r2_score(y_test , y_pred_lstm)
(0.369640842426902, 0.05359974508444154)
# NOTE(review): this seeds with the FIRST 12 training months (data_train[:12]),
# so the len(data_train) recursive predictions correspond to months 13 onward,
# yet they are later compared index-for-index against the full training series
# — confirm the intended alignment.
last_train_batch = data_train[:12]
last_train_batch = last_train_batch.to_numpy()
#get the generator shape (1,12,1)
current_batch = last_train_batch.reshape((1,n_input,n_features))
current_batch.shape
#y_pred
y_pred_lstm_train = []
for i in range(len(data_train)):
    #prediction
    current_pred = model_lstm.predict(current_batch)[0]
    #save the prediction
    y_pred_lstm_train.append(current_pred)
    #take one step forward: drop the oldest month, append the current prediction
    current_batch = np.append(current_batch[:,1:,:],[[current_pred]],axis=1)
1/1 [==============================] - 0s 23ms/step 1/1 [==============================] - 0s 30ms/step 1/1 [==============================] - 0s 16ms/step 1/1 [==============================] - 0s 26ms/step 1/1 [==============================] - 0s 17ms/step 1/1 [==============================] - 0s 25ms/step 1/1 [==============================] - 0s 19ms/step 1/1 [==============================] - 0s 28ms/step 1/1 [==============================] - 0s 19ms/step 1/1 [==============================] - 0s 26ms/step 1/1 [==============================] - 0s 22ms/step 1/1 [==============================] - 0s 14ms/step 1/1 [==============================] - 0s 36ms/step 1/1 [==============================] - 0s 13ms/step 1/1 [==============================] - 0s 31ms/step 1/1 [==============================] - 0s 28ms/step 1/1 [==============================] - 0s 17ms/step 1/1 [==============================] - 0s 26ms/step 1/1 [==============================] - 0s 26ms/step 1/1 [==============================] - 0s 25ms/step 1/1 [==============================] - 0s 25ms/step 1/1 [==============================] - 0s 24ms/step 1/1 [==============================] - 0s 27ms/step 1/1 [==============================] - 0s 25ms/step 1/1 [==============================] - 0s 25ms/step 1/1 [==============================] - 0s 30ms/step 1/1 [==============================] - 0s 30ms/step 1/1 [==============================] - 0s 29ms/step 1/1 [==============================] - 0s 30ms/step 1/1 [==============================] - 0s 35ms/step 1/1 [==============================] - 0s 33ms/step 1/1 [==============================] - 0s 29ms/step 1/1 [==============================] - 0s 26ms/step 1/1 [==============================] - 0s 27ms/step 1/1 [==============================] - 0s 25ms/step 1/1 [==============================] - 0s 25ms/step
ml_data_train["y_pred_lstm_train"] = y_pred_lstm_train
plt.plot(y_pred_lstm_train)
plt.plot(data_train.to_numpy(),label="actual train")
plt.legend()
<matplotlib.legend.Legend at 0x27919ebd750>
mean_absolute_percentage_error(data_train["Sales"], y_pred_lstm_train), r2_score(data_train["Sales"] , y_pred_lstm_train)
(0.32897593375894935, 0.6286672559813002)
# Test-set metrics for the unscaled LSTM.
# CONSISTENCY FIX: use ml_data_test["Sales"] as ground truth everywhere
# (the original mixed y_test and ml_data_test["Sales"]).
lstm_r2score_test = r2_score(ml_data_test["Sales"],ml_data_test["y_pred_lstm"])
lstm_mae_test = mean_absolute_error(ml_data_test["Sales"] , ml_data_test["y_pred_lstm"])
lstm_mape_test = mean_absolute_percentage_error(ml_data_test["Sales"] , ml_data_test["y_pred_lstm"])
# squared=False -> true RMSE (the original stored MSE under the RMSE label).
lstm_rmse_test = mean_squared_error(ml_data_test["Sales"] , ml_data_test["y_pred_lstm"], squared=False)
# Training-set metrics.
lstm_r2score_train = r2_score(ml_data_train["Sales"],ml_data_train["y_pred_lstm_train"])
lstm_mae_train = mean_absolute_error(ml_data_train["Sales"],ml_data_train["y_pred_lstm_train"])
lstm_mape_train = mean_absolute_percentage_error(ml_data_train["Sales"],ml_data_train["y_pred_lstm_train"])
lstm_rmse_train = mean_squared_error(ml_data_train["Sales"],ml_data_train["y_pred_lstm_train"], squared=False)
evaluation_lstm_model = {
"Models":["lstm"],
"R2_score_train":[lstm_r2score_train],
"R2_score_test":[lstm_r2score_test],
"MAE_train":[lstm_mae_train],
"MAE_test":[lstm_mae_test],
"RMSE_train":[lstm_rmse_train],
"RMSE_test":[lstm_rmse_test],
"MAPE_train":[lstm_mape_train],
"MAPE_test":[lstm_mape_test],
}
evaluation_lstm_model = pd.DataFrame(evaluation_lstm_model).set_index("Models")
evaluation_lstm_model
| R2_score_train | R2_score_test | MAE_train | MAE_test | RMSE_train | RMSE_test | MAPE_train | MAPE_test | |
|---|---|---|---|---|---|---|---|---|
| Models | ||||||||
| lstm | 0.628667 | 0.0536 | 11488.084605 | 21950.868795 | 1.970034e+08 | 6.010164e+08 | 0.328976 | 0.369641 |
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
#create working copies of both splits
data_train_scaled = data_train.copy()
data_test_scaled = data_test.copy()
# Fit the scaler on the TRAINING data only to avoid test-set leakage.
scaler.fit(data_train)
MinMaxScaler()
# Min-max scale both splits with the train-fitted scaler.
# (The original comment said "standarize"; this is normalisation to [0, 1].)
data_train_scaled = scaler.transform(data_train_scaled)
data_test_scaled = scaler.transform(data_test_scaled)
from keras.preprocessing.sequence import TimeseriesGenerator
# define generator over the SCALED training series: 12-month windows -> next month
n_input = 12
n_features = 1
generator_scaled = TimeseriesGenerator(data_train_scaled,data_train_scaled, length=n_input,batch_size=1)
#we are using the last 12 months to predict the next month
keras.utils.set_random_seed(812)  # FIX: the original called this twice; once suffices
# Same architecture as the unscaled model: bidirectional LSTM + small dense head.
lstm_model_scaled = Sequential()
lstm_model_scaled.add(InputLayer((n_input,n_features)))
lstm_model_scaled.add(Bidirectional(LSTM(200,activation='relu',return_sequences=False, kernel_regularizer=l2(0.001),
recurrent_regularizer=l2(0.001), bias_regularizer=l2(0.001))))
lstm_model_scaled.add(Dense(100,activation='relu', kernel_regularizer=l2(0.001)))
lstm_model_scaled.add(Dense(4,activation='relu', kernel_regularizer=l2(0.001)))
lstm_model_scaled.add(Dense(1))
lstm_model_scaled.compile(loss="mean_squared_error",optimizer=Adam(0.001),metrics=["mape"])
# Shorter patience than the unscaled run (100/50 vs 500/200).
ES_scaled = EarlyStopping(monitor='loss', verbose=1,
patience=100, restore_best_weights=True
)
reduce_lr_scaled = ReduceLROnPlateau(monitor='loss', factor=0.1,
patience=50, min_lr=0.000001)
lstm_model_scaled.fit(generator_scaled,epochs=1000,callbacks=[ES_scaled,reduce_lr_scaled])
# Seed the recursive test forecast with the LAST 12 scaled training months.
last_train_batch = data_train_scaled[-12:]
#get the generator shape (1,12,1)
current_batch = last_train_batch.reshape((1,n_input,n_features))
current_batch.shape
# Recursive multi-step forecast in the scaled space.
y_pred_scaled = []
for i in range(len(data_test_scaled)):
    #prediction
    current_pred = lstm_model_scaled.predict(current_batch)[0]
    #save the prediction
    y_pred_scaled.append(current_pred)
    #take one step forward: drop the oldest month, append the current prediction
    current_batch = np.append(current_batch[:,1:,:],[[current_pred]],axis=1)
1/1 [==============================] - 0s 22ms/step 1/1 [==============================] - 0s 24ms/step 1/1 [==============================] - 0s 26ms/step 1/1 [==============================] - 0s 21ms/step 1/1 [==============================] - 0s 27ms/step 1/1 [==============================] - 0s 30ms/step 1/1 [==============================] - 0s 23ms/step 1/1 [==============================] - 0s 27ms/step 1/1 [==============================] - 0s 24ms/step 1/1 [==============================] - 0s 36ms/step 1/1 [==============================] - 0s 27ms/step 1/1 [==============================] - 0s 22ms/step
y_pred_reversed = scaler.inverse_transform(y_pred_scaled)
y_test_reversed = scaler.inverse_transform(data_test_scaled)
ml_data_test["y_pred_lstm_scaled"] = y_pred_reversed
plt.plot(y_pred_reversed)
plt.plot(y_test_reversed,label="actual test")
plt.legend()
<matplotlib.legend.Legend at 0x27916c026b0>
mean_absolute_percentage_error(y_test_reversed , y_pred_reversed), r2_score(y_test_reversed , y_pred_reversed)
(0.2136429849241549, 0.7718621109447332)
# NOTE(review): seeds with the FIRST 12 scaled training months, so the
# recursive predictions correspond to months 13 onward while they are later
# compared against the full training series — verify the intended alignment.
last_train_batch = data_train_scaled[:12]
#get the generator shape (1,12,1)
current_batch = last_train_batch.reshape((1,n_input,n_features))
current_batch.shape
#y_pred
y_pred_scaled_train = []
#for i in range(len(data_train_scaled[12:])):
for i in range(len(data_train_scaled)):
    #prediction
    current_pred = lstm_model_scaled.predict(current_batch)[0]
    #save the prediction
    y_pred_scaled_train.append(current_pred)
    #take one step forward: drop the oldest month, append the current prediction
    current_batch = np.append(current_batch[:,1:,:],[[current_pred]],axis=1)
1/1 [==============================] - 0s 22ms/step 1/1 [==============================] - 0s 29ms/step 1/1 [==============================] - 0s 22ms/step 1/1 [==============================] - 0s 15ms/step 1/1 [==============================] - 0s 33ms/step 1/1 [==============================] - 0s 15ms/step 1/1 [==============================] - 0s 25ms/step 1/1 [==============================] - 0s 32ms/step 1/1 [==============================] - 0s 24ms/step 1/1 [==============================] - 0s 20ms/step 1/1 [==============================] - 0s 19ms/step 1/1 [==============================] - 0s 27ms/step 1/1 [==============================] - 0s 25ms/step 1/1 [==============================] - 0s 16ms/step 1/1 [==============================] - 0s 23ms/step 1/1 [==============================] - 0s 28ms/step 1/1 [==============================] - 0s 27ms/step 1/1 [==============================] - 0s 29ms/step 1/1 [==============================] - 0s 31ms/step 1/1 [==============================] - 0s 27ms/step 1/1 [==============================] - 0s 31ms/step 1/1 [==============================] - 0s 29ms/step 1/1 [==============================] - 0s 27ms/step 1/1 [==============================] - 0s 28ms/step 1/1 [==============================] - 0s 21ms/step 1/1 [==============================] - 0s 26ms/step 1/1 [==============================] - 0s 27ms/step 1/1 [==============================] - 0s 27ms/step 1/1 [==============================] - 0s 24ms/step 1/1 [==============================] - 0s 25ms/step 1/1 [==============================] - 0s 23ms/step 1/1 [==============================] - 0s 26ms/step 1/1 [==============================] - 0s 25ms/step 1/1 [==============================] - 0s 24ms/step 1/1 [==============================] - 0s 22ms/step 1/1 [==============================] - 0s 31ms/step
# Map predictions and actuals back to the original sales scale.
y_pred_train_reversed = scaler.inverse_transform(y_pred_scaled_train)
#y_train
y_train_reversed = scaler.inverse_transform(data_train_scaled)
ml_data_train["y_pred_scaled_lstm_train"] = y_pred_train_reversed
plt.plot(y_train_reversed,label="actual train")
# BUGFIX: the original referenced the undefined name 'y_pred_reversed_train'
# (NameError); the variable defined above is 'y_pred_train_reversed'.
plt.plot(y_pred_train_reversed,label = "prediction lstm scaled")
plt.legend()
<matplotlib.legend.Legend at 0x2791909b550>
mean_absolute_percentage_error(y_train_reversed , y_pred_train_reversed), r2_score(y_train_reversed , y_pred_train_reversed)
(0.5382105769020075, 0.5092028271261082)
# Test-set metrics for the scaled LSTM (evaluated on the original scale).
# CONSISTENCY FIX: use ml_data_test["Sales"] as ground truth everywhere
# (the original mixed y_test and ml_data_test["Sales"]).
lstm_r2score_test_scaled = r2_score(ml_data_test["Sales"],ml_data_test["y_pred_lstm_scaled"])
lstm_mae_test_scaled = mean_absolute_error(ml_data_test["Sales"] , ml_data_test["y_pred_lstm_scaled"])
lstm_mape_test_scaled = mean_absolute_percentage_error(ml_data_test["Sales"] , ml_data_test["y_pred_lstm_scaled"])
# squared=False -> true RMSE (the original stored MSE under the RMSE label).
lstm_rmse_test_scaled = mean_squared_error(ml_data_test["Sales"] , ml_data_test["y_pred_lstm_scaled"], squared=False)
# Training-set metrics.
lstm_r2score_train_scaled = r2_score(ml_data_train["Sales"],ml_data_train["y_pred_scaled_lstm_train"])
lstm_mae_train_scaled = mean_absolute_error(ml_data_train["Sales"],ml_data_train["y_pred_scaled_lstm_train"])
lstm_mape_train_scaled = mean_absolute_percentage_error(ml_data_train["Sales"],ml_data_train["y_pred_scaled_lstm_train"])
lstm_rmse_train_scaled = mean_squared_error(ml_data_train["Sales"],ml_data_train["y_pred_scaled_lstm_train"], squared=False)
evaluation_lstm_model_scaled = {
"Models":["lstm scaled"],
"R2_score_train":[lstm_r2score_train_scaled],
"R2_score_test":[lstm_r2score_test_scaled],
"MAE_train":[lstm_mae_train_scaled],
"MAE_test":[lstm_mae_test_scaled],
"RMSE_train":[lstm_rmse_train_scaled],
"RMSE_test":[lstm_rmse_test_scaled],
"MAPE_train":[lstm_mape_train_scaled],
"MAPE_test":[lstm_mape_test_scaled],
}
evaluation_lstm_model_scaled = pd.DataFrame(evaluation_lstm_model_scaled).set_index("Models")
evaluation_lstm_model_scaled
| R2_score_train | R2_score_test | MAE_train | MAE_test | RMSE_train | RMSE_test | MAPE_train | MAPE_test | |
|---|---|---|---|---|---|---|---|---|
| Models | ||||||||
| lstm scaled | 0.509203 | 0.771862 | 13345.963589 | 10306.87348 | 2.603830e+08 | 1.448802e+08 | 0.538211 | 0.213643 |
# Fresh helper instances for the Prophet section.
train = Train()
vs = visualization()
sp = Samples()
# Prophet requires the two columns 'ds' (datestamp) and 'y' (target),
# so move the date index into a column and rename.
prophet_data = general_data.reset_index()
prophet_data.columns = ["ds","y"]
prophet_data.head()
| ds | y | |
|---|---|---|
| 0 | 2011-01-31 | 13946.229 |
| 1 | 2011-02-28 | 4810.558 |
| 2 | 2011-03-31 | 55691.009 |
| 3 | 2011-04-30 | 28295.345 |
| 4 | 2011-05-31 | 23648.287 |
# 75% / 25% chronological split, then fit a Prophet model with defaults.
data_train_prophet , data_test_prophet = train.split_data(prophet_data,0.75)
from prophet import Prophet
warnings.filterwarnings('ignore')
prophet_model = Prophet()
prophet_model.fit(data_train_prophet)
13:01:59 - cmdstanpy - INFO - Chain [1] start processing 13:01:59 - cmdstanpy - INFO - Chain [1] done processing
<prophet.forecaster.Prophet at 0x27871d588b0>
y_pred_prophet_test = prophet_model.predict(data_test_prophet)
y_pred_prophet_train = prophet_model.predict(data_train_prophet)
y_pred_prophet_test
| ds | trend | yhat_lower | yhat_upper | trend_lower | trend_upper | additive_terms | additive_terms_lower | additive_terms_upper | yearly | yearly_lower | yearly_upper | multiplicative_terms | multiplicative_terms_lower | multiplicative_terms_upper | yhat | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2014-01-31 | 52503.431802 | 21900.307786 | 34548.042852 | 52491.911921 | 52511.797284 | -24108.199333 | -24108.199333 | -24108.199333 | -24108.199333 | -24108.199333 | -24108.199333 | 0.0 | 0.0 | 0.0 | 28395.232468 |
| 1 | 2014-02-28 | 53089.209144 | 17680.527003 | 30395.651108 | 53054.769260 | 53116.337255 | -29110.609582 | -29110.609582 | -29110.609582 | -29110.609582 | -29110.609582 | -29110.609582 | 0.0 | 0.0 | 0.0 | 23978.599562 |
| 2 | 2014-03-31 | 53737.748345 | 59306.595647 | 71264.207118 | 53673.110468 | 53789.725206 | 11487.422760 | 11487.422760 | 11487.422760 | 11487.422760 | 11487.422760 | 11487.422760 | 0.0 | 0.0 | 0.0 | 65225.171105 |
| 3 | 2014-04-30 | 54365.366926 | 39155.104054 | 51630.813998 | 54265.032318 | 54453.172713 | -8661.713539 | -8661.713539 | -8661.713539 | -8661.713539 | -8661.713539 | -8661.713539 | 0.0 | 0.0 | 0.0 | 45703.653387 |
| 4 | 2014-05-31 | 55013.906127 | 41247.773643 | 54015.233183 | 54868.766714 | 55141.530618 | -7754.571834 | -7754.571834 | -7754.571834 | -7754.571834 | -7754.571834 | -7754.571834 | 0.0 | 0.0 | 0.0 | 47259.334293 |
| 5 | 2014-06-30 | 55641.524708 | 41266.407070 | 54163.561453 | 55448.983794 | 55818.425312 | -7980.308292 | -7980.308292 | -7980.308292 | -7980.308292 | -7980.308292 | -7980.308292 | 0.0 | 0.0 | 0.0 | 47661.216416 |
| 6 | 2014-07-31 | 56290.063909 | 40927.715135 | 53959.851374 | 56044.063847 | 56510.680466 | -8610.095196 | -8610.095196 | -8610.095196 | -8610.095196 | -8610.095196 | -8610.095196 | 0.0 | 0.0 | 0.0 | 47679.968713 |
| 7 | 2014-08-31 | 56938.603110 | 38076.701920 | 50136.468206 | 56637.753984 | 57213.284257 | -13024.994277 | -13024.994277 | -13024.994277 | -13024.994277 | -13024.994277 | -13024.994277 | 0.0 | 0.0 | 0.0 | 43913.608833 |
| 8 | 2014-09-30 | 57566.221691 | 84047.370505 | 96267.050079 | 57214.256214 | 57894.788979 | 32734.986408 | 32734.986408 | 32734.986408 | 32734.986408 | 32734.986408 | 32734.986408 | 0.0 | 0.0 | 0.0 | 90301.208099 |
| 9 | 2014-10-31 | 58214.760892 | 46320.481910 | 59058.996287 | 57802.382415 | 58607.839507 | -5632.071311 | -5632.071311 | -5632.071311 | -5632.071311 | -5632.071311 | -5632.071311 | 0.0 | 0.0 | 0.0 | 52582.689580 |
| 10 | 2014-11-30 | 58842.379473 | 87836.767853 | 100504.198479 | 58357.156612 | 59308.784866 | 34932.047982 | 34932.047982 | 34932.047982 | 34932.047982 | 34932.047982 | 34932.047982 | 0.0 | 0.0 | 0.0 | 93774.427455 |
| 11 | 2014-12-31 | 59490.918674 | 87096.080439 | 99816.163295 | 58934.674052 | 60020.974597 | 33835.652315 | 33835.652315 | 33835.652315 | 33835.652315 | 33835.652315 | 33835.652315 | 0.0 | 0.0 | 0.0 | 93326.570989 |
plt.plot(data_test_prophet.set_index("ds")['y'],label = "actual test")
plt.plot(y_pred_prophet_test.set_index("ds")['yhat'],label = "prophet prediction")
plt.plot(results_sarima["Y_prediction"][9],label="predicted Testing data SARIMA(1,0,1)(1,1,0,12)")
plt.legend()
<matplotlib.legend.Legend at 0x27871d716c0>
plt.plot(data_train_prophet.set_index("ds")['y'],label = "actual data")
plt.plot(y_pred_prophet_train.set_index("ds")['yhat'],label = "prophet prediction")
plt.plot(results_sarima["Y_prediction_train"][9],label="predicted training data SARIMA(1,0,1)(1,1,0,12)")
plt.legend()
<matplotlib.legend.Legend at 0x278797a4b20>
pd.concat([results_sarima["Y_prediction_train"][9],results_sarima["Y_prediction_train"][9]],axis=0).shape[0]
48
full_data = general_data["Sales"]
full_pred_data_prophet = pd.concat([y_pred_prophet_train,y_pred_prophet_test],axis=0).set_index("ds")['yhat']
full_pred_data_sarima = pd.concat([results_sarima["Y_prediction_train"][9],results_sarima["Y_prediction"][9]],axis=0)
plt.plot(full_data,label = "actual data")
plt.plot(full_pred_data_prophet,label = "prophet prediction")
plt.plot(full_pred_data_sarima,label="SARIMA(1,0,1)(1,1,0,12) prediction")
plt.legend()
<matplotlib.legend.Legend at 0x27879813eb0>
# Prophet evaluation: R2 / MAE / MAPE / RMSE on both splits.
# Align actuals and predictions on the 'ds' timestamp once, up front,
# instead of repeating set_index() in every metric call.
actual_test = data_test_prophet.set_index("ds")['y']
predicted_test = y_pred_prophet_test.set_index("ds")['yhat']
actual_train = data_train_prophet.set_index("ds")['y']
predicted_train = y_pred_prophet_train.set_index("ds")['yhat']
#test metrics
prophet_r2score_test = r2_score(actual_test, predicted_test)
prophet_mae_test = mean_absolute_error(actual_test, predicted_test)
prophet_mape_test = mean_absolute_percentage_error(actual_test, predicted_test)
prophet_rmse_test = mean_squared_error(actual_test, predicted_test, squared=False)
#train metrics
prophet_r2score_train = r2_score(actual_train, predicted_train)
prophet_mae_train = mean_absolute_error(actual_train, predicted_train)
prophet_mape_train = mean_absolute_percentage_error(actual_train, predicted_train)
prophet_rmse_train = mean_squared_error(actual_train, predicted_train, squared=False)
# Summary table, one row per model.
evaluation_prophet_model = pd.DataFrame({
    "Models": ["Prophet"],
    "R2_score_train": [prophet_r2score_train],
    "R2_score_test": [prophet_r2score_test],
    "MAE_train": [prophet_mae_train],
    "MAE_test": [prophet_mae_test],
    "RMSE_train": [prophet_rmse_train],
    "RMSE_test": [prophet_rmse_test],
    "MAPE_train": [prophet_mape_train],
    "MAPE_test": [prophet_mape_test],
}).set_index("Models")
evaluation_prophet_model
| R2_score_train | R2_score_test | MAE_train | MAE_test | RMSE_train | RMSE_test | MAPE_train | MAPE_test | |
|---|---|---|---|---|---|---|---|---|
| Models | ||||||||
| Prophet | 0.954472 | 0.771645 | 3514.996649 | 8689.211198 | 4914.649843 | 12042.344072 | 0.09562 | 0.147361 |
pd.concat([evaluation_prophet_model,evaluation_SARIMA],axis=0)
| R2_score_train | R2_score_test | MAE_train | MAE_test | RMSE_train | RMSE_test | MAPE_train | MAPE_test | |
|---|---|---|---|---|---|---|---|---|
| Models | ||||||||
| Prophet | 0.954472 | 0.771645 | 3514.996649 | 8689.211198 | 4914.649843 | 12042.344072 | 0.095620 | 0.147361 |
| SARIMA(11,0,1)(1,1,0,12) | 0.892036 | 0.716059 | 6100.436514 | 11702.058017 | 7337.374139 | 13428.253990 | 0.161604 | 0.218682 |
| SARIMA(1,0,1)(1,1,0,12) | 0.855312 | 0.743349 | 6837.146239 | 11230.945070 | 8494.098190 | 12766.648308 | 0.167065 | 0.205828 |
#!pip install captum
from neuralprophet import NeuralProphet
Importing plotly failed. Interactive plots will not work. Importing plotly failed. Interactive plots will not work.
neuralprophet_model = NeuralProphet()
neuralprophet_model.fit(data_train_prophet,freq='M',epochs=300)
WARNING - (NP.forecaster.fit) - When Global modeling with local normalization, metrics are displayed in normalized scale. INFO - (NP.df_utils._infer_frequency) - Major frequency M corresponds to [97.222]% of the data. INFO - (NP.df_utils._infer_frequency) - Defined frequency is equal to major frequency - M INFO - (NP.config.init_data_params) - Setting normalization to global as only one dataframe provided for training. INFO - (NP.utils.set_auto_seasonalities) - Disabling weekly seasonality. Run NeuralProphet with weekly_seasonality=True to override this. INFO - (NP.utils.set_auto_seasonalities) - Disabling daily seasonality. Run NeuralProphet with daily_seasonality=True to override this. INFO - (NP.config.set_auto_batch_epoch) - Auto-set batch_size to 8
Training: | | 0/? [00:00<…
WARNING - (NP.config.set_lr_finder_args) - Learning rate finder: The number of batches (5) is too small than the required number for the learning rate finder (203). The results might not be optimal.
Finding best initial lr: 0%| | 0/203 [00:00<?, ?it/s]
Training: | | 0/? [00:00<…
| train_loss | reg_loss | MAE | RMSE | Loss | RegLoss | epoch | |
|---|---|---|---|---|---|---|---|
| 0 | 0.576601 | 0.0 | 66055.007812 | 78161.195312 | 0.568585 | 0.0 | 0 |
| 1 | 0.546444 | 0.0 | 65255.726562 | 76609.328125 | 0.558355 | 0.0 | 1 |
| 2 | 0.522759 | 0.0 | 61907.750000 | 73915.914062 | 0.524516 | 0.0 | 2 |
| 3 | 0.498709 | 0.0 | 57918.320312 | 68703.906250 | 0.474571 | 0.0 | 3 |
| 4 | 0.472351 | 0.0 | 57100.070312 | 67472.382812 | 0.469975 | 0.0 | 4 |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 295 | 0.005884 | 0.0 | 4735.492188 | 5506.465820 | 0.006817 | 0.0 | 295 |
| 296 | 0.005883 | 0.0 | 4212.630859 | 5122.612793 | 0.005747 | 0.0 | 296 |
| 297 | 0.005882 | 0.0 | 4343.882324 | 5290.363770 | 0.006061 | 0.0 | 297 |
| 298 | 0.005888 | 0.0 | 4226.879883 | 5246.337891 | 0.005761 | 0.0 | 298 |
| 299 | 0.005890 | 0.0 | 4303.228027 | 5340.196777 | 0.005991 | 0.0 | 299 |
300 rows × 7 columns
y_pred_neuralprophet_test = neuralprophet_model.predict(data_test_prophet)
y_pred_neuralprophet_train = neuralprophet_model.predict(data_train_prophet)
INFO - (NP.df_utils._infer_frequency) - Major frequency M corresponds to [91.667]% of the data. INFO - (NP.df_utils._infer_frequency) - Defined frequency is equal to major frequency - M INFO - (NP.df_utils._infer_frequency) - Major frequency M corresponds to [91.667]% of the data. INFO - (NP.df_utils._infer_frequency) - Defined frequency is equal to major frequency - M
Predicting: | | 0/? [00:00<…
INFO - (NP.df_utils.return_df_in_original_format) - Returning df with no ID column INFO - (NP.df_utils._infer_frequency) - Major frequency M corresponds to [97.222]% of the data. INFO - (NP.df_utils._infer_frequency) - Defined frequency is equal to major frequency - M INFO - (NP.df_utils._infer_frequency) - Major frequency M corresponds to [97.222]% of the data. INFO - (NP.df_utils._infer_frequency) - Defined frequency is equal to major frequency - M
Predicting: | | 0/? [00:00<…
INFO - (NP.df_utils.return_df_in_original_format) - Returning df with no ID column
plt.plot(data_test_prophet.set_index("ds")['y'],label = "actual test")
plt.plot(y_pred_neuralprophet_test.set_index("ds")['yhat1'],label = "Neural prophet prediction")
plt.plot(results_sarima["Y_prediction"][9],label="predicted Testing data SARIMA(1,0,1)(1,1,0,12)")
plt.legend()
<matplotlib.legend.Legend at 0x2787a2085e0>
plt.plot(data_train_prophet.set_index("ds")['y'],label = "actual data")
plt.plot(y_pred_neuralprophet_train.set_index("ds")['yhat1'],label = "Neural prophet prediction")
plt.plot(results_sarima["Y_prediction_train"][9],label="predicted training data SARIMA(1,0,1)(1,1,0,12)")
plt.legend()
<matplotlib.legend.Legend at 0x27879811300>
full_data = general_data["Sales"]
full_pred_data_neuralprophet = pd.concat([y_pred_neuralprophet_train,y_pred_neuralprophet_test],axis=0).set_index("ds")['yhat1']
full_pred_data_sarima = pd.concat([results_sarima["Y_prediction_train"][9],results_sarima["Y_prediction"][9]],axis=0)
plt.plot(full_data,label = "actual data")
plt.plot(full_pred_data_neuralprophet,label = "neural prophet prediction")
plt.plot(full_pred_data_sarima,label="SARIMA(1,0,1)(1,1,0,12) prediction")
plt.legend()
<matplotlib.legend.Legend at 0x27879810700>
# NeuralProphet evaluation: R2 / MAE / MAPE / RMSE on both splits.
# Align actuals and predictions on 'ds' once instead of repeating
# set_index() inside every metric call (NeuralProphet names its forecast 'yhat1').
np_actual_test = data_test_prophet.set_index("ds")['y']
np_predicted_test = y_pred_neuralprophet_test.set_index("ds")['yhat1']
np_actual_train = data_train_prophet.set_index("ds")['y']
np_predicted_train = y_pred_neuralprophet_train.set_index("ds")['yhat1']
#test metrics
neuralprophet_r2score_test = r2_score(np_actual_test, np_predicted_test)
neuralprophet_mae_test = mean_absolute_error(np_actual_test, np_predicted_test)
neuralprophet_mape_test = mean_absolute_percentage_error(np_actual_test, np_predicted_test)
neuralprophet_rmse_test = mean_squared_error(np_actual_test, np_predicted_test, squared=False)
#train metrics
neuralprophet_r2score_train = r2_score(np_actual_train, np_predicted_train)
neuralprophet_mae_train = mean_absolute_error(np_actual_train, np_predicted_train)
neuralprophet_mape_train = mean_absolute_percentage_error(np_actual_train, np_predicted_train)
neuralprophet_rmse_train = mean_squared_error(np_actual_train, np_predicted_train, squared=False)
# Summary table, one row per model.
evaluation_neuralprophet_model = pd.DataFrame({
    "Models": ["Neural Prophet"],
    "R2_score_train": [neuralprophet_r2score_train],
    "R2_score_test": [neuralprophet_r2score_test],
    "MAE_train": [neuralprophet_mae_train],
    "MAE_test": [neuralprophet_mae_test],
    "RMSE_train": [neuralprophet_rmse_train],
    "RMSE_test": [neuralprophet_rmse_test],
    "MAPE_train": [neuralprophet_mape_train],
    "MAPE_test": [neuralprophet_mape_test],
}).set_index("Models")
evaluation_neuralprophet_model
| R2_score_train | R2_score_test | MAE_train | MAE_test | RMSE_train | RMSE_test | MAPE_train | MAPE_test | |
|---|---|---|---|---|---|---|---|---|
| Models | ||||||||
| Neural Prophet | 0.944283 | 0.810693 | 4245.915709 | 9707.271683 | 5436.871599 | 10964.514707 | 0.129653 | 0.188317 |
pd.concat([evaluation_prophet_model,evaluation_neuralprophet_model,evaluation_SARIMA],axis=0)
| R2_score_train | R2_score_test | MAE_train | MAE_test | RMSE_train | RMSE_test | MAPE_train | MAPE_test | |
|---|---|---|---|---|---|---|---|---|
| Models | ||||||||
| Prophet | 0.954472 | 0.771645 | 3514.996649 | 8689.211198 | 4914.649843 | 12042.344072 | 0.095620 | 0.147361 |
| Neural Prophet | 0.944283 | 0.810693 | 4245.915709 | 9707.271683 | 5436.871599 | 10964.514707 | 0.129653 | 0.188317 |
| SARIMA(11,0,1)(1,1,0,12) | 0.892036 | 0.716059 | 6100.436514 | 11702.058017 | 7337.374139 | 13428.253990 | 0.161604 | 0.218682 |
| SARIMA(1,0,1)(1,1,0,12) | 0.855312 | 0.743349 | 6837.146239 | 11230.945070 | 8494.098190 | 12766.648308 | 0.167065 | 0.205828 |
general_data.head()
| Sales | |
|---|---|
| 2011-01-31 | 13946.229 |
| 2011-02-28 | 4810.558 |
| 2011-03-31 | 55691.009 |
| 2011-04-30 | 28295.345 |
| 2011-05-31 | 23648.287 |
general_data["Sales_pct_change"] = general_data["Sales"].pct_change()
general_data.head()
| Sales | Sales_pct_change | |
|---|---|---|
| 2011-01-31 | 13946.229 | NaN |
| 2011-02-28 | 4810.558 | -0.655064 |
| 2011-03-31 | 55691.009 | 10.576829 |
| 2011-04-30 | 28295.345 | -0.491923 |
| 2011-05-31 | 23648.287 | -0.164234 |
# Drop the first row (its pct_change is NaN).
general_data2 = general_data[1:]
# NOTE(review): general_data2 is created but the plot below uses general_data
# (whose first pct_change value is NaN) — confirm which one was intended.
vs.lineplot(general_data.drop("Sales",axis=1),"Monthly")
# Inter-quartile range of the month-over-month percentage change
# (pandas .quantile ignores the leading NaN).
Q1 = general_data['Sales_pct_change'].quantile(0.25)
Q3 = general_data['Sales_pct_change'].quantile(0.75)
IQR = Q3 - Q1
print("IQR =",IQR)
IQR = 0.6794344230190767
# Flag values outside the 1.5*IQR Tukey fences as outliers
# (NaN compares False on both sides, so the first row is never flagged).
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
general_data['is_outlier'] = (general_data['Sales_pct_change'] < lower_bound) | (general_data['Sales_pct_change'] > upper_bound)
general_data.head()
| Sales | Sales_pct_change | is_outlier | |
|---|---|---|---|
| 2011-01-31 | 13946.229 | NaN | False |
| 2011-02-28 | 4810.558 | -0.655064 | False |
| 2011-03-31 | 55691.009 | 10.576829 | True |
| 2011-04-30 | 28295.345 | -0.491923 | False |
| 2011-05-31 | 23648.287 | -0.164234 | False |
general_data.is_outlier.value_counts()
is_outlier False 43 True 5 Name: count, dtype: int64
So, we observe that there are 5 outliers. Let's plot them to better understand whether they are really outliers.
# Collect the flagged outlier values as a date-indexed Series.
# FIX: the original looped with enumerate and indexed the Series positionally
# (general_data["Sales_pct_change"][i]), which is deprecated/ambiguous on a
# datetime-indexed Series in modern pandas; a boolean mask is equivalent
# and unambiguous.
outlier_mask = general_data["is_outlier"]
pct = general_data["Sales_pct_change"]
list_idx_outliers = list(pct.index[outlier_mask])
list_outliers = pd.Series(pct[outlier_mask].to_numpy(), index=list_idx_outliers)
list_outliers
2011-03-31 10.576829 2011-09-30 1.930093 2011-11-30 1.499848 2012-03-31 2.150210 2014-03-31 1.657772 dtype: float64
#Visualisation
# Overlay the flagged points (red) on the pct-change line plot.
plt.plot(general_data["Sales_pct_change"],label="percentage change")
plt.scatter(list_outliers.index,list_outliers,label="outliers",color="r")
plt.legend()
<matplotlib.legend.Legend at 0x2787a6c5ab0>
# Visualize the flagged months against the raw (un-transformed) sales series.
# Label-based selection replaces the original O(n^2) enumerate/`in` scan.
outliers = general_data.loc[list_outliers.index, "Sales"].tolist()
outliers
#Visualisation
plt.plot(general_data["Sales"], label="Sales")
plt.scatter(list_outliers.index, outliers, label="outliers", color="r")
# BUG FIX: plt.xticks takes `labels=`, not `label=` — the original kwarg was
# silently absorbed as a Text property and the extra list wrapper
# ([general_data.index]) would not have produced per-tick labels anyway.
plt.xticks(ticks=general_data.index, labels=general_data.index, rotation=90)
plt.legend()
<matplotlib.legend.Legend at 0x2787a706110>
=> The peaks seem logical and seasonal, so I won't consider them outliers.
# Trend / seasonality / ACF / PACF diagnostics on the pct-change series
# (Sales and is_outlier columns dropped; leading NaN row removed with [1:]).
vs.trend_plot(general_data.drop(["Sales","is_outlier"],axis=1)[1:],"Monthly")
vs.seasonality_plot(general_data.drop(["Sales","is_outlier"],axis=1)[1:],"Monthly")
vs.acf_plot(general_data.drop(["Sales","is_outlier"],axis=1)[1:],"Monthly")
vs.pacf_plot(general_data.drop(["Sales","is_outlier"],axis=1)[1:],"Monthly")
# After differencing: re-check trend and seasonality on the first difference
# of the pct-change series.
vs.trend_plot(general_data.drop(["Sales","is_outlier"],axis=1)[1:].diff().dropna(),"Monthly")
vs.seasonality_plot(general_data.drop(["Sales","is_outlier"],axis=1)[1:].diff().dropna(),"Monthly")
=> Seasonality: March -> March => S = 12
# 75/25 chronological train/test split, then ADF stationarity test on the
# raw pct-change series (leading NaN dropped with [1:]).
data_train_pct , data_test_pct = train.split_data(general_data,0.75)
result_pct_change = train.adf_test(general_data["Sales_pct_change"][1:])
result_pct_change
| Statistical test | P-values | Lags used | Number of observations | |
|---|---|---|---|---|
| 0 | -1.731106 | 0.415161 | 10 | 36 |
H0: the data is not stationary
H1: the data is stationary
p-value > 0.05 => we fail to reject H0, so based on the ADF test the data is not stationary.
We'll difference the data and check the ACF and PACF plots again.
# ADF test again after one round of differencing (candidate d=1).
result_pct_change_diff = train.adf_test(general_data["Sales_pct_change"][1:].diff().dropna())
result_pct_change_diff
| Statistical test | P-values | Lags used | Number of observations | |
|---|---|---|---|---|
| 0 | -4.987877 | 0.000023 | 9 | 36 |
=> p-value < 0.05 => the data is stationary (d=1)
# ACF/PACF of the differenced pct-change series guide the ARMA order choice.
vs.acf_plot(general_data.drop(["Sales","is_outlier"],axis=1)[1:].diff().dropna(),"Monthly")
vs.pacf_plot(general_data.drop(["Sales","is_outlier"],axis=1)[1:].diff().dropna(),"Monthly")
# Removed: a dead triple-quoted snippet that manually thresholded PACF values;
# it was commented out and only echoed itself back as a string.
'pacf_segment , _ = pacf(general_data.drop(["Sales","is_outlier"],axis=1)[1:].diff().dropna(),alpha=0.05)\nprint("before")\nprint(pacf_segment)\npacf_segment = [lag for lag,value in enumerate(pacf_segment) if abs(value)> 0.25]\npacf_segment'
# Grid-search ARIMA orders (candidate p/q lags taken from ACF/PACF) on a 75% train split.
models_arima_pct = train.train_arima(general_data.drop(["Sales","is_outlier"],axis=1)[1:],0.75)
0%| | 0/1 [00:00<?, ?it/s]
ADF Statistic: -1.7311063266224096 p-value: 0.4151609992562723 acf [0, 1, 6, 12] pacf [0, 1, 2, 3, 5, 7, 11, 12, 13, 14, 15, 16] (ar,d,ma) : (0, 1, 0) (ar,d,ma) : (0, 1, 1) (ar,d,ma) : (0, 1, 6) (ar,d,ma) : (0, 1, 12) (ar,d,ma) : (1, 1, 0) (ar,d,ma) : (1, 1, 1) (ar,d,ma) : (1, 1, 6) (ar,d,ma) : (1, 1, 12) (ar,d,ma) : (2, 1, 0) (ar,d,ma) : (2, 1, 1) (ar,d,ma) : (2, 1, 6) (ar,d,ma) : (2, 1, 12) (ar,d,ma) : (3, 1, 0) (ar,d,ma) : (3, 1, 1) (ar,d,ma) : (3, 1, 6) (ar,d,ma) : (3, 1, 12) (ar,d,ma) : (5, 1, 0) (ar,d,ma) : (5, 1, 1) (ar,d,ma) : (5, 1, 6) (ar,d,ma) : (5, 1, 12) (ar,d,ma) : (7, 1, 0) (ar,d,ma) : (7, 1, 1) (ar,d,ma) : (7, 1, 6) (ar,d,ma) : (7, 1, 12) (ar,d,ma) : (11, 1, 0) (ar,d,ma) : (11, 1, 1) (ar,d,ma) : (11, 1, 6) (ar,d,ma) : (11, 1, 12) (ar,d,ma) : (12, 1, 0) (ar,d,ma) : (12, 1, 1) (ar,d,ma) : (12, 1, 6) (ar,d,ma) : (12, 1, 12) (ar,d,ma) : (13, 1, 0) (ar,d,ma) : (13, 1, 1) (ar,d,ma) : (13, 1, 6) (ar,d,ma) : (13, 1, 12) (ar,d,ma) : (14, 1, 0) (ar,d,ma) : (14, 1, 1) (ar,d,ma) : (14, 1, 6) (ar,d,ma) : (14, 1, 12) (ar,d,ma) : (15, 1, 0) (ar,d,ma) : (15, 1, 1) (ar,d,ma) : (15, 1, 6) (ar,d,ma) : (15, 1, 12) (ar,d,ma) : (16, 1, 0) (ar,d,ma) : (16, 1, 1) (ar,d,ma) : (16, 1, 6) (ar,d,ma) : (16, 1, 12)
100%|███████████████████████████████████████████████████████████████████████████████████| 1/1 [01:41<00:00, 101.63s/it]
# Summarize the ARIMA grid-search: best in-sample fit vs. best hold-out
# generalization, by R2 and MAPE.
max_r2score_train_pct = max(models_arima_pct["R2_Score_train"])
max_r2score_test_pct = max(models_arima_pct["R2_Score"])
min_mape_train_pct = min(models_arima_pct["MAPE_train"])
min_mape_test_pct = min(models_arima_pct["MAPE"])
# Typo fix in the user-facing messages: "hightest" -> "highest".
print(f"the highest R2_score (on train data , to test how much the model fits the training data ): {max_r2score_train_pct}")
print(f"the highest R2_score on test data : {max_r2score_test_pct}")
print()
print(f"the lowest mean absolute percentage error on trainset: {min_mape_train_pct}")
print(f"the lowest mean absolute percentage error on test set: {min_mape_test_pct}")
the hightest R2_score (on train data , to test how much the model fits the training data ): -1.2765190841612815 the hightest R2_score on test data : 0.6851646812647361 the lowest mean absolute percentage error on trainset: 1.2305482507603642 the lowest mean absolute percentage error on test set: 1.3890398092636291
# Row(s) of the results table achieving the best in-sample R2.
print("the model that best fits the training set :")
models_arima_pct[models_arima_pct["R2_Score_train"] == max_r2score_train_pct]
the model that best fits the training set :
| Model | Segment | ACF | d | PACF | Order | MAE | MAPE | RMSE | R2_Score | MAE_train | MAPE_train | R2_Score_train | RMSE_train | Y_test | Y_prediction | Y_prediction_train | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | <statsmodels.tsa.arima.model.ARIMAResultsWrapp... | Sales_pct_change | 1 | 1 | 0 | (0, 1, 1) | 0.643857 | 1.38904 | 0.933977 | -0.003621 | 1.936415 | 1.230548 | -1.276519 | 4.459963 | 2014-01-31 -0.540268 2014-02-28 -0.546262 ... | 2014-01-31 0.207683 2014-02-28 0.232318 ... | 2011-03-31 -0.655064 2011-04-30 10.57672... |
# Row(s) of the results table achieving the best hold-out R2.
print("The model with best performance on test set :")
models_arima_pct[models_arima_pct["R2_Score"] == max_r2score_test_pct]
The model with best performance on test set :
| Model | Segment | ACF | d | PACF | Order | MAE | MAPE | RMSE | R2_Score | MAE_train | MAPE_train | R2_Score_train | RMSE_train | Y_test | Y_prediction | Y_prediction_train | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 21 | <statsmodels.tsa.arima.model.ARIMAResultsWrapp... | Sales_pct_change | 1 | 1 | 7 | (7, 1, 1) | 0.471725 | 6.513956 | 0.52311 | 0.685165 | 2.147307 | 3.009751 | -1.555854 | 4.725672 | 2014-01-31 -0.540268 2014-02-28 -0.546262 ... | 2014-01-31 -0.004319 2014-02-28 -0.348517 ... | 2011-03-31 -0.655064 2011-04-30 10.57657... |
# Grid-search SARIMA orders on the same 75% train split, then summarize:
# best in-sample fit vs. best hold-out generalization, by R2 and MAPE.
results_sarima_pct = train.train_sarima(general_data.drop(["Sales","is_outlier"],axis=1)[1:],0.75)
max_r2score_sarima_train_pct = max(results_sarima_pct["R2_Score_train"])
max_r2score_sarima_test_pct = max(results_sarima_pct["R2_Score"])
min_mape_sarima_train_pct = min(results_sarima_pct["MAPE_train"])
min_mape_sarima_test_pct = min(results_sarima_pct["MAPE"])
# Typo fix in the user-facing messages: "hightest" -> "highest".
print(f"the highest R2_score (on train data , to test how much the model fits the training data ): {max_r2score_sarima_train_pct}")
print(f"the highest R2_score on test data : {max_r2score_sarima_test_pct}")
print()
print(f"the lowest mean absolute percentage error on trainset: {min_mape_sarima_train_pct}")
print(f"the lowest mean absolute percentage error on test set: {min_mape_sarima_test_pct}")
the hightest R2_score (on train data , to test how much the model fits the training data ): -10.55856709875193 the hightest R2_score on test data : 0.5204471015367813 the lowest mean absolute percentage error on trainset: 4.1633515437696955 the lowest mean absolute percentage error on test set: 3.161813943989401
# Row(s) of the SARIMA results table achieving the best in-sample R2.
print("the model that best fits the training set :")
results_sarima_pct[results_sarima_pct["R2_Score_train"] == max_r2score_sarima_train_pct]
the model that best fits the training set :
| Model | Segment | ACF | d | PACF | Order | P | D | Q | S | ... | MAPE | RMSE | R2_Score | MAE_train | MAPE_train | R2_Score_train | RMSE_train | Y_test | Y_prediction | Y_prediction_train | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | <statsmodels.tsa.arima.model.ARIMAResultsWrapp... | Sales_pct_change | 1.0 | 1.0 | 0.0 | (0, 1, 1) | 2.0 | 1.0 | 2.0 | 12.0 | ... | 18.5326 | 1.820453 | -2.812906 | 1.623677 | 4.163352 | -10.558567 | 3.909592 | 2014-01-31 -0.540268 2014-02-28 -0.546262 ... | 2014-01-31 -0.584346 2014-02-28 0.541102 ... | 2012-03-31 5.371631 2012-04-30 -14.45074... |
1 rows × 21 columns
# Row(s) of the SARIMA results table achieving the best hold-out R2.
print("The model with best performance on test set :")
results_sarima_pct[results_sarima_pct["R2_Score"] == max_r2score_sarima_test_pct]
The model with best performance on test set :
| Model | Segment | ACF | d | PACF | Order | P | D | Q | S | ... | MAPE | RMSE | R2_Score | MAE_train | MAPE_train | R2_Score_train | RMSE_train | Y_test | Y_prediction | Y_prediction_train | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | <statsmodels.tsa.arima.model.ARIMAResultsWrapp... | Sales_pct_change | 6.0 | 1.0 | 1.0 | (1, 1, 6) | 0.0 | 1.0 | 1.0 | 12.0 | ... | 10.286225 | 0.645609 | 0.520447 | 1.858955 | 6.672035 | -11.355593 | 4.042139 | 2014-01-31 -0.540268 2014-02-28 -0.546262 ... | 2014-01-31 -0.840695 2014-02-28 -0.203079 ... | 2012-03-31 5.371844 2012-04-30 -14.45064... |
1 rows × 21 columns
# NOTE(review): auto_arima is already imported at the top of the file;
# this re-import is redundant but harmless.
from pmdarima import auto_arima
# Automated order search on the training pct-change series with a seasonal
# period of m=12 months.
# NOTE(review): trend=[1,1,1,1] enables intercept + linear + quadratic +
# cubic trend terms — confirm a cubic trend is really intended here.
auto_arima_pctchange = auto_arima(data_train_pct["Sales_pct_change"][1:],m=12,
max_order=None, max_p=12,max_d=2,max_q=12,
max_P=4,max_D=2,max_Q=4,
maxiter=500, alpha=0.05, n_jobs= -1,trend=[1,1,1,1])
auto_arima_pctchange.summary()
| Dep. Variable: | y | No. Observations: | 35 |
|---|---|---|---|
| Model: | SARIMAX(1, 0, 0) | Log Likelihood | -65.119 |
| Date: | Sun, 23 Jun 2024 | AIC | 142.238 |
| Time: | 13:18:39 | BIC | 151.570 |
| Sample: | 02-28-2011 | HQIC | 145.460 |
| - 12-31-2013 | |||
| Covariance Type: | opg |
| coef | std err | z | P>|z| | [0.025 | 0.975] | |
|---|---|---|---|---|---|---|
| intercept | 6.1531 | 0.635 | 9.684 | 0.000 | 4.908 | 7.398 |
| drift | -1.1241 | 0.279 | -4.029 | 0.000 | -1.671 | -0.577 |
| trend.2 | 0.0611 | 0.023 | 2.674 | 0.008 | 0.016 | 0.106 |
| trend.3 | -0.0010 | 0.000 | -2.050 | 0.040 | -0.002 | -4.36e-05 |
| ar.L1 | -0.4409 | 0.173 | -2.546 | 0.011 | -0.780 | -0.101 |
| sigma2 | 2.0270 | 0.394 | 5.144 | 0.000 | 1.255 | 2.799 |
| Ljung-Box (L1) (Q): | 0.04 | Jarque-Bera (JB): | 16.21 |
|---|---|---|---|
| Prob(Q): | 0.85 | Prob(JB): | 0.00 |
| Heteroskedasticity (H): | 0.10 | Skew: | 0.45 |
| Prob(H) (two-sided): | 0.00 | Kurtosis: | 6.21 |
=> so based on auto_arima the best model is ARIMA(1, 0, 0)
# Refit the auto_arima-selected order (1,0,0) with statsmodels ARIMA so the
# coefficient table and diagnostics can be inspected directly.
model_pct = ARIMA(data_train_pct['Sales_pct_change'][1:], order=(1, 0, 0)).fit()
model_pct.summary()
| Dep. Variable: | Sales_pct_change | No. Observations: | 35 |
|---|---|---|---|
| Model: | ARIMA(1, 0, 0) | Log Likelihood | -70.936 |
| Date: | Sun, 23 Jun 2024 | AIC | 147.872 |
| Time: | 13:18:39 | BIC | 152.538 |
| Sample: | 02-28-2011 | HQIC | 149.483 |
| - 12-31-2013 | |||
| Covariance Type: | opg |
| coef | std err | z | P>|z| | [0.025 | 0.975] | |
|---|---|---|---|---|---|---|
| const | 0.4947 | 0.670 | 0.738 | 0.461 | -0.819 | 1.809 |
| ar.L1 | -0.2063 | 0.296 | -0.696 | 0.486 | -0.787 | 0.374 |
| sigma2 | 3.3681 | 0.688 | 4.897 | 0.000 | 2.020 | 4.716 |
| Ljung-Box (L1) (Q): | 0.02 | Jarque-Bera (JB): | 742.42 |
|---|---|---|---|
| Prob(Q): | 0.90 | Prob(JB): | 0.00 |
| Heteroskedasticity (H): | 0.05 | Skew: | 4.40 |
| Prob(H) (two-sided): | 0.00 | Kurtosis: | 23.78 |
=> We observe that the p-value of ar.L1 is 0.486 > 0.05, so the AR(1) coefficient is not statistically significant and this model's results may be misleading.
# In-sample one-step predictions and an out-of-sample forecast spanning the
# test window.
y_pred_train_best_AAM_pct = model_pct.predict()
y_pred_test_best_AAM_pct = model_pct.forecast(len(data_test_pct["Sales_pct_change"]))
#y_pred_test_arima_pct =train.reverse_diff(data_train_pct["Sales"][1:],y_pred_test_arima_pct,0,"test")
# In-sample R2 of the ARIMA(1,0,0) fit.
r2_score(data_train_pct["Sales_pct_change"][1:],y_pred_train_best_AAM_pct)
0.04402237321798519
# Evaluate the auto_arima-selected ARIMA(1,0,0) on the train and test splits.
# BUG FIX: the "RMSE" entries were computed with mean_squared_error, which
# returns the MSE (no square root); wrap in np.sqrt so the reported values
# are true RMSEs, comparable with the other models' RMSE columns.
#train
best_auto_arima_model_pct_r2score_train = r2_score(data_train_pct["Sales_pct_change"][1:],y_pred_train_best_AAM_pct)
best_auto_arima_model_pct_mae_train = mean_absolute_error(data_train_pct["Sales_pct_change"][1:],y_pred_train_best_AAM_pct)
best_auto_arima_model_pct_mape_train = mean_absolute_percentage_error(data_train_pct["Sales_pct_change"][1:],y_pred_train_best_AAM_pct)
best_auto_arima_model_pct_rmse_train = np.sqrt(mean_squared_error(data_train_pct["Sales_pct_change"][1:],y_pred_train_best_AAM_pct))
#test
best_auto_arima_model_pct_r2score_test = r2_score(data_test_pct["Sales_pct_change"],y_pred_test_best_AAM_pct)
best_auto_arima_model_pct_mae_test = mean_absolute_error(data_test_pct["Sales_pct_change"],y_pred_test_best_AAM_pct)
best_auto_arima_model_pct_mape_test = mean_absolute_percentage_error(data_test_pct["Sales_pct_change"],y_pred_test_best_AAM_pct)
best_auto_arima_model_pct_rmse_test = np.sqrt(mean_squared_error(data_test_pct["Sales_pct_change"],y_pred_test_best_AAM_pct))
# Assemble a one-row comparison table indexed by model name.
evaluation_best_auto_arima_model_pct = {
    "Models":["ARIMA(1, 0, 0)"],
    "R2_score_train":[best_auto_arima_model_pct_r2score_train],
    "R2_score_test":[best_auto_arima_model_pct_r2score_test],
    "MAE_train":[best_auto_arima_model_pct_mae_train],
    "MAE_test":[best_auto_arima_model_pct_mae_test],
    "RMSE_train":[best_auto_arima_model_pct_rmse_train],
    "RMSE_test":[best_auto_arima_model_pct_rmse_test],
    "MAPE_train":[best_auto_arima_model_pct_mape_train],
    "MAPE_test":[best_auto_arima_model_pct_mape_test],
}
evaluation_best_auto_arima_model_pct = pd.DataFrame(evaluation_best_auto_arima_model_pct).set_index("Models")
evaluation_best_auto_arima_model_pct
| R2_score_train | R2_score_test | MAE_train | MAE_test | RMSE_train | RMSE_test | MAPE_train | MAPE_test | |
|---|---|---|---|---|---|---|---|---|
| Models | ||||||||
| ARIMA(1, 0, 0) | 0.044022 | -0.482695 | 0.960065 | 0.5788 | 3.36977 | 0.472072 | 4.147281 | 13.952267 |
Using pct_change to standardize the data does not improve the model's performance.